1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-22 01:32:03 +01:00

Make view specify where and drive data motion - first cut.

This is a compile-time option: --enable-unified=yes/no
This commit is contained in:
Peter Boyle
2020-05-21 16:13:16 -04:00
parent ebb60330c9
commit 7860a50f70
48 changed files with 688 additions and 718 deletions

View File

@ -233,10 +233,10 @@ public:
Uconj = where(coor==neglink,-Uconj,Uconj);
}
auto U_v = U.View();
auto Uds_v = Uds.View();
auto Uconj_v = Uconj.View();
auto Utmp_v= Utmp.View();
auto U_v = U.View(CpuRead);
auto Uds_v = Uds.View(CpuWrite);
auto Uconj_v = Uconj.View(CpuRead);
auto Utmp_v= Utmp.View(CpuWrite);
thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss]();
@ -272,8 +272,8 @@ public:
GaugeLinkField link(mat.Grid());
// use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
auto link_v = link.View();
auto tmp_v = tmp.View();
auto link_v = link.View(CpuWrite);
auto tmp_v = tmp.View(CpuRead);
thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
});
@ -306,9 +306,9 @@ public:
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
auto tmp_v = tmp.View();
auto Atilde_v = Atilde.View();
auto Btilde_v = Btilde.View();
auto tmp_v = tmp.View(CpuWrite);
auto Atilde_v = Atilde.View(CpuRead);
auto Btilde_v = Btilde.View(CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;

View File

@ -264,8 +264,8 @@ private:
{
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
auto T_v = T.View(CpuWrite);
auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(),
{
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
@ -282,8 +282,8 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
auto T_v = T.View(CpuWrite);
auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(),
{
T_v[i]()(0, 1) = -F_v[i]()();
@ -300,8 +300,8 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
auto T_v = T.View(CpuWrite);
auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(),
{
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
@ -318,8 +318,8 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
auto T_v = T.View(CpuWrite);
auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(),
{
T_v[i]()(0, 1) = timesI(F_v[i]()());
@ -336,8 +336,8 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
auto T_v = T.View(CpuWrite);
auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(),
{
T_v[i]()(0, 1) = -(F_v[i]()());
@ -355,8 +355,8 @@ private:
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
auto T_v = T.View(CpuWrite);
auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(),
{
T_v[i]()(0, 0) = timesI(F_v[i]()());

View File

@ -106,9 +106,9 @@ public:
const _SpinorField & phi,
int mu)
{
auto out_v= out.View();
auto phi_v= phi.View();
auto Umu_v= Umu.View();
auto out_v= out.View(CpuWrite);
auto phi_v= phi.View(CpuRead);
auto Umu_v= Umu.View(CpuRead);
thread_for(sss,out.Grid()->oSites(),{
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
});
@ -191,9 +191,9 @@ public:
int Ls=Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
auto tmp_v = tmp.View();
auto Btilde_v = Btilde.View();
auto Atilde_v = Atilde.View();
auto tmp_v = tmp.View(CpuWrite);
auto Btilde_v = Btilde.View(CpuRead);
auto Atilde_v = Atilde.View(CpuRead);
thread_for(sss,tmp.Grid()->oSites(),{
int sU=sss;
for(int s=0;s<Ls;s++){

View File

@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
{
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
int Ls=this->Ls;
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
GridBase *grid=psi_i.Grid();
int Ls=this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
auto plee = & lee [0];
auto pdee = & dee [0];

View File

@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(CpuRead);
auto phi = phi_i.View(CpuRead);
auto chi = chi_i.View(CpuWrite);
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd= Simd::Nsimd();
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi=psi_i.View();
auto phi=phi_i.View();
auto chi=chi_i.View();
auto psi=psi_i.View(CpuRead);
auto phi=phi_i.View(CpuRead);
auto chi=chi_i.View(CpuWrite);
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd();
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
Vector<iSinglet<Simd> > &Matm)
{
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(CpuRead);
auto chi = chi_i.View(CpuWrite);
#ifndef AVX512
{
SiteHalfSpinor BcastP;
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
EnableIf<Impl::LsVectorised,int> sfinae=0;
#ifndef AVX512
{
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(CpuRead);
auto chi = chi_i.View(CpuWrite);
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
}
#else
{
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(CpuRead);
auto chi = chi_i.View(CpuWrite);
// pointers
// MASK_REGS;
#define Chi_00 %zmm0

View File

@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls;
GridBase* grid = psi_i.Grid();
auto phi = phi_i.View();
auto psi = psi_i.View();
auto chi = chi_i.View();
auto phi = phi_i.View(AcceleratorRead);
auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
GridBase* grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid();
auto psi=psi_i.View();
auto chi=chi_i.View();
auto psi=psi_i.View(AcceleratorRead);
auto chi=chi_i.View(AcceleratorWrite);
int Ls = this->Ls;
auto plee = & this->lee[0];
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
int Ls = this->Ls;
auto plee = & this->lee[0];

View File

@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
Compressor compressor;
Stencil.HaloExchange(in,compressor);
auto Umu_v = Umu.View();
auto UUUmu_v = UUUmu.View();
auto in_v = in.View();
auto out_v = out.View();
auto Umu_v = Umu.View(CpuRead);
auto UUUmu_v = UUUmu.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
thread_for( ss,Umu.Grid()->oSites(),{
for(int s=0;s<Ls;s++){
int sU=ss;
@ -339,10 +339,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
@ -376,10 +376,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DhopComputeTime2-=usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for( ss,sz,{
@ -418,10 +418,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;

View File

@ -250,10 +250,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
////////////////////////
// Call the single hop
////////////////////////
auto U_v = U.View();
auto UUU_v = UUU.View();
auto B_v = B.View();
auto Btilde_v = Btilde.View();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto B_v = B.View(CpuWrite);
auto Btilde_v = Btilde.View(CpuWrite);
thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
});
@ -378,10 +378,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
Compressor compressor;
Stencil.HaloExchange(in, compressor);
auto Umu_v = Umu.View();
auto UUUmu_v = UUUmu.View();
auto in_v = in.View();
auto out_v = out.View();
auto Umu_v = Umu.View(CpuRead);
auto UUUmu_v = UUUmu.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
thread_for( sss, in.Grid()->oSites(),{
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
});
@ -449,10 +449,10 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
@ -479,10 +479,10 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopComputeTime2 -= usecond();
{
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for(ss,sz,{
@ -520,10 +520,10 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
DhopComputeTime -= usecond();
if (dag == DaggerYes) {
thread_for(sss, in.Grid()->oSites(),{

View File

@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
auto pm = this->pm;
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
auto pm = this->pm;
auto plee = & this->lee [0];
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(AcceleratorWrite);
int Ls = this->Ls;
auto pm = this->pm;

View File

@ -475,12 +475,12 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
// Inefficient comms method but not performance critical.
tmp1 = Cshift(q_in_1, mu, 1);
tmp2 = Cshift(q_in_2, mu, 1);
auto tmp1_v = tmp1.View();
auto tmp2_v = tmp2.View();
auto q_in_1_v=q_in_1.View();
auto q_in_2_v=q_in_2.View();
auto q_out_v = q_out.View();
auto Umu_v = Umu.View();
auto tmp1_v = tmp1.View(CpuWrite);
auto tmp2_v = tmp2.View(CpuWrite);
auto q_in_1_v=q_in_1.View(CpuRead);
auto q_in_2_v=q_in_2.View(CpuRead);
auto q_out_v = q_out.View(CpuRead);
auto Umu_v = Umu.View(CpuRead);
thread_for(sU, Umu.Grid()->oSites(),{
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
q_in_2_v[sU],
@ -526,11 +526,11 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = lattice_cmplx*q_in;
tmpBwd = Cshift(tmp, mu, -1);
auto coords_v = coords.View();
auto tmpFwd_v = tmpFwd.View();
auto tmpBwd_v = tmpBwd.View();
auto Umu_v = Umu.View();
auto q_out_v = q_out.View();
auto coords_v = coords.View(CpuRead);
auto tmpFwd_v = tmpFwd.View(CpuRead);
auto tmpBwd_v = tmpBwd.View(CpuRead);
auto Umu_v = Umu.View(CpuRead);
auto q_out_v = q_out.View(CpuWrite);
thread_for(sU, Umu.Grid()->oSites(), {

View File

@ -348,18 +348,18 @@ template <class Impl>
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out)
{
auto U_v = U.View();
auto in_v = in.View();
auto st_v = st.View();
auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(AcceleratorRead);
auto st_v = st.View(AcceleratorRead);
auto out_Xm = out[0].View();
auto out_Ym = out[1].View();
auto out_Zm = out[2].View();
auto out_Tm = out[3].View();
auto out_Xp = out[4].View();
auto out_Yp = out[5].View();
auto out_Zp = out[6].View();
auto out_Tp = out[7].View();
auto out_Xm = out[0].View(AcceleratorWrite);
auto out_Ym = out[1].View(AcceleratorWrite);
auto out_Zm = out[2].View(AcceleratorWrite);
auto out_Tm = out[3].View(AcceleratorWrite);
auto out_Xp = out[4].View(AcceleratorWrite);
auto out_Yp = out[5].View(AcceleratorWrite);
auto out_Zp = out[6].View(AcceleratorWrite);
auto out_Tp = out[7].View(AcceleratorWrite);
auto CBp=st.CommBuf();
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls;
@ -383,10 +383,10 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
assert(dirdisp<=7);
assert(dirdisp>=0);
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(AcceleratorWrite);
auto st_v = st.View(AcceleratorRead);
auto CBp=st.CommBuf();
#define LoopBody(Dir) \
case Dir : \
@ -438,10 +438,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior)
{
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(AcceleratorWrite);
auto st_v = st.View(AcceleratorRead);
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
@ -469,10 +469,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior)
{
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(AcceleratorWrite);
auto st_v = st.View(AcceleratorRead);
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}