diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 52e1ee00..d7941d1f 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -106,11 +106,15 @@ public: const _SpinorField & phi, int mu) { + const int Nsimd = SiteHalfSpinor::Nsimd(); autoView( out_v, out, AcceleratorWrite); autoView( phi_v, phi, AcceleratorRead); autoView( Umu_v, Umu, AcceleratorRead); - accelerator_for(sss,out.Grid()->oSites(),1,{ - multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); + typedef decltype(coalescedRead(out_v[0])) calcSpinor; + accelerator_for(sss,out.Grid()->oSites(),Nsimd,{ + calcSpinor tmp; + multLink(tmp,Umu_v[sss],phi_v(sss),mu); + coalescedWrite(out_v[sss],tmp); }); } diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 032535b3..5d602ce9 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,7 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; -#if 1 +#if 0 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; #else double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 5282c756..ea88885e 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -117,8 +117,8 @@ int main (int argc, char ** argv) else { std::cout<::ColdConfiguration(Umu); - // SU::HotConfiguration(RNG4,Umu); + //SU::ColdConfiguration(Umu); + SU::HotConfiguration(RNG4,Umu); } RealD mass=0.3;