From 8720aecb80bdc40e1fcbb6f48612b2918ebd123c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 10 Jun 2020 12:57:55 -0400 Subject: [PATCH] Offload more loops --- Grid/qcd/action/fermion/WilsonCloverFermion.h | 37 ++++---- Grid/qcd/action/fermion/WilsonImpl.h | 16 ++-- .../WilsonFermionImplementation.h | 84 +------------------ 3 files changed, 29 insertions(+), 108 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index aa8fb150..a2755389 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -258,15 +258,16 @@ private: CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO + public: // eventually these can be compressed into 6x6 blocks instead of the 12x12 // using the DeGrand-Rossi basis for the gamma matrices CloverFieldType fillCloverYZ(const GaugeLinkField &F) { CloverFieldType T(F.Grid()); T = Zero(); - autoView(T_v,T,CpuWrite); - autoView(F_v,F,CpuRead); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); @@ -282,9 +283,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - autoView(T_v, T,CpuWrite); - autoView(F_v, F,CpuRead); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v, T,AcceleratorWrite); + autoView(F_v, F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = -F_v[i]()(); T_v[i]()(1, 0) = F_v[i]()(); @@ -300,9 +301,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - autoView(T_v,T,CpuWrite); - autoView(F_v,F,CpuRead); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); T_v[i]()(1, 1) = timesI(F_v[i]()()); @@ -318,9 +319,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - autoView( T_v , T, CpuWrite); - autoView( F_v , F, CpuRead); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v , T, AcceleratorWrite); + autoView( F_v , F, AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = timesI(F_v[i]()()); T_v[i]()(1, 0) = timesI(F_v[i]()()); @@ -336,9 +337,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - autoView( T_v ,T,CpuWrite); - autoView( F_v ,F,CpuRead); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v ,T,AcceleratorWrite); + autoView( F_v ,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = -(F_v[i]()()); T_v[i]()(1, 0) = (F_v[i]()()); @@ -355,9 +356,9 @@ private: T = Zero(); - autoView( T_v , T,CpuWrite); - autoView( F_v , F,CpuRead); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v , T,AcceleratorWrite); + autoView( F_v , F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 0) = timesI(F_v[i]()()); T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index b4afc69a..52e1ee00 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -106,10 +106,10 @@ public: const _SpinorField & phi, int mu) { - autoView( out_v, out, CpuWrite); - autoView( phi_v, phi, CpuRead); - autoView( Umu_v, Umu, CpuRead); - thread_for(sss,out.Grid()->oSites(),{ + autoView( out_v, out, AcceleratorWrite); + autoView( phi_v, phi, AcceleratorRead); + autoView( Umu_v, Umu, AcceleratorRead); + accelerator_for(sss,out.Grid()->oSites(),1,{ multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); }); } @@ -192,10 +192,10 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); { - autoView( tmp_v , tmp, CpuWrite); - autoView( Btilde_v , Btilde, CpuRead); - autoView( Atilde_v , Atilde, CpuRead); - thread_for(sss,tmp.Grid()->oSites(),{ + autoView( tmp_v , tmp, AcceleratorWrite); + autoView( Btilde_v , Btilde, AcceleratorRead); + autoView( Atilde_v , Atilde, AcceleratorRead); + accelerator_for(sss,tmp.Grid()->oSites(),1,{ int sU=sss; for(int s=0;s::ContractConservedCurrent(PropagatorField &q_in_1, conformable(_grid, q_in_1.Grid()); conformable(_grid, q_in_2.Grid()); conformable(_grid, q_out.Grid()); -#if 0 - PropagatorField tmp1(_grid), tmp2(_grid); - q_out = Zero(); - - // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu). - // Inefficient comms method but not performance critical. - tmp1 = Cshift(q_in_1, mu, 1); - tmp2 = Cshift(q_in_2, mu, 1); - autoView( tmp1_v , tmp1, CpuWrite); - autoView( tmp2_v , tmp2, CpuWrite); - autoView( q_in_1_v,q_in_1, CpuRead); - autoView( q_in_2_v,q_in_2, CpuRead); - autoView( q_out_v , q_out, CpuRead); - autoView( Umu_v , Umu, CpuRead); - thread_for(sU, Umu.Grid()->oSites(),{ - Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU], - q_in_2_v[sU], - q_out_v[sU], - Umu_v, sU, mu); - Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU], - tmp2_v[sU], - q_out_v[sU], - Umu_v, sU, mu); - }); -#else -#endif + assert(0); } @@ -508,62 +483,7 @@ void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, { conformable(_grid, q_in.Grid()); conformable(_grid, q_out.Grid()); -#if 0 - - // Lattice> ph(_grid), coor(_grid); - Complex i(0.0,1.0); - PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid); - unsigned int tshift = (mu == Tp) ? 1 : 0; - unsigned int LLt = GridDefaultLatt()[Tp]; - - q_out = Zero(); - LatticeInteger coords(_grid); - LatticeCoordinate(coords, Tp); - - // Need q(x + mu) and q(x - mu). - tmp = Cshift(q_in, mu, 1); - tmpFwd = tmp*lattice_cmplx; - tmp = lattice_cmplx*q_in; - tmpBwd = Cshift(tmp, mu, -1); - - autoView( coords_v , coords, CpuRead); - autoView( tmpFwd_v , tmpFwd, CpuRead); - autoView( tmpBwd_v , tmpBwd, CpuRead); - autoView( Umu_v , Umu, CpuRead); - autoView( q_out_v , q_out, CpuWrite); - - thread_for(sU, Umu.Grid()->oSites(), { - - // Compute the sequential conserved current insertion only if our simd - // object contains a timeslice we need. - vPredicate t_mask; - t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax)); - Integer timeSlices = Reduce(t_mask()); - - if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], - q_out_v[sU], - Umu_v, sU, mu, t_mask); - } - - // Repeat for backward direction. - t_mask() = ((coords_v[sU] >= (tmin + tshift)) && - (coords_v[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) - unsigned int t0 = 0; - if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 )); - - timeSlices = Reduce(t_mask()); - - if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], - q_out_v[sU], - Umu_v, sU, mu, t_mask); - } - }); -#else -#endif + assert(0); } NAMESPACE_END(Grid);