Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering

2025-11-23 16:09:32 +00:00 · 2016-12-14 09:23:22 +00:00
parent 426197e446 c22c3db9ad
commit 0cd6b1858c
8 changed files with 257 additions and 127 deletions
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=8;
+  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -70,7 +70,7 @@ int main (int argc, char ** argv)
  if (1)
  {
-    const int ncall=100;
+    const int ncall=1000;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
@@ -81,18 +81,7 @@ int main (int argc, char ** argv)
    LatticeFermion result(FGrid);
    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
    FGrid->Barrier();
    double t0,t1;
    t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
    LatticeFermion r_eo(FGrid);
    LatticeFermion src_e (FrbGrid);
@@ -109,48 +98,46 @@ int main (int argc, char ** argv)
    r_e = zero;
    r_o = zero;
    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.DhopEO(src_o, r_e, DaggerNo);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
-    FGrid->Barrier();
+#define BENCH_DW(A,in,out)			\
-    t0=usecond();
+    Dw.CayleyZeroCounters();			\
-    for (int i = 0; i < ncall; i++) {
+    Dw. A (in,out);				\
-      Dw.Mooee(src_o, r_o);
+    FGrid->Barrier();				\
-    }
+    t0=usecond();				\
-    t1=usecond();
+    for(int i=0;i<ncall;i++){			\
-    FGrid->Barrier();
+      Dw. A (in,out);				\
-    std::cout<<GridLogMessage << "Called Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
+    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;
-    FGrid->Barrier();
+#define BENCH_DW_MEO(A,in,out)			\
-    t0=usecond();
+    Dw.CayleyZeroCounters();			\
-    for (int i = 0; i < ncall; i++) {
+    Dw. A (in,out,0);				\
-      Dw.MooeeInv(src_o, r_o);
+    FGrid->Barrier();				\
-    }
+    t0=usecond();				\
-    t1=usecond();
+    for(int i=0;i<ncall;i++){			\
-    FGrid->Barrier();
+      Dw. A (in,out,0);				\
-    std::cout<<GridLogMessage << "Called MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
+    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;
-
+    BENCH_DW_MEO(Dhop    ,src,result);
-    FGrid->Barrier();
+    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
-    t0=usecond();
+    BENCH_DW(Meooe   ,src_o,r_e);
-    for (int i = 0; i < ncall; i++) {
+    BENCH_DW(Mooee   ,src_o,r_o);
-      Dw.Meooe(src_o, r_e);
+    BENCH_DW(MooeeInv,src_o,r_o);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
  }
  if (1)
  {
-    const int ncall=100;
+    const int ncall=1000;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
@@ -168,14 +155,6 @@ int main (int argc, char ** argv)
    FGrid->Barrier();
    double t0,t1;
    t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
    LatticeFermion r_eo(sFGrid);
    LatticeFermion src_e (sFrbGrid);
@@ -192,46 +171,13 @@ int main (int argc, char ** argv)
    r_e = zero;
    r_o = zero;
-    FGrid->Barrier();
+    BENCH_DW_MEO(Dhop    ,src,result);
-    t0=usecond();
+    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
-    for (int i = 0; i < ncall; i++) {
+    BENCH_DW(Meooe   ,src_o,r_e);
-      Dw.DhopEO(src_o, r_e, DaggerNo);
+    BENCH_DW(Mooee   ,src_o,r_o);
-    }
+    BENCH_DW(MooeeInv,src_o,r_o);
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.Mooee(src_o, r_o);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.MooeeInv(src_o, r_o);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.Meooe(src_o, r_e);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
  }
  Grid_finalize();
 }
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
  }
 }
 // Print the performance counters gathered since the last CayleyZeroCounters().
 //
 // First forwards to this->Report(), then prints one section for each of the
 // M5D and MooeeInv counter sets, skipping a section whose call count is zero.
 // Each section shows the call count, the mean time per call in microseconds,
 // and a throughput figure both in total and divided by the processor count
 // read from _FourDimGrid.
 template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 {
  this->Report();
  // Site count used for the flop estimate: Ls times the product of the Nd
  // entries returned by GridDefaultLatt().
  std::vector<int> latt = GridDefaultLatt();          
  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP     = this->_FourDimGrid->_Nprocessors;
  if ( M5Dcalls > 0 ) {
    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
    // Flops = 6.0*(Nc*Ns) *Ls*vol
    // M5Dtime is accumulated from usecond() differences, so flops divided by
    // microseconds reads directly as Mflop/s.
    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }
  if ( MooeeInvCalls > 0 ) {
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
    // Flops = 9*12*Ls*vol/2
    // Same unit convention as the M5D section: flops per microsecond.
    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }
 }
 // Reset all performance counters so a following CayleyReport() covers only
 // the work done after this call.
 //
 // Forwards to this->ZeroCounters() first, then clears the M5D and MooeeInv
 // flop, call and time accumulators declared on this class.
 template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
 {
  this->ZeroCounters();
  // NOTE(review): M5Dflops and MooeeInvFlops are reset here but CayleyReport()
  // never reads them; it derives flops from the call counts instead. Confirm
  // whether they are updated elsewhere.
  M5Dflops=0;
  M5Dcalls=0;
  M5Dtime=0;
  MooeeInvFlops=0;
  MooeeInvCalls=0;
  MooeeInvTime=0;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -120,6 +120,18 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
     void CayleyReport(void);
     void CayleyZeroCounters(void);
     double M5Dflops;
     double M5Dcalls;
     double M5Dtime;
     double MooeeInvFlops;
     double MooeeInvCalls;
     double MooeeInvTime;
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  M5Dcalls++;
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    for(int s=0;s<Ls;s++){
@@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
      }
    }
  }
  M5Dtime+=usecond();
 }
 template<class Impl>  
@@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  M5Dcalls++;
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
@@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
      }
    }
  }
  M5Dtime+=usecond();
 }
 template<class Impl>
@@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &
  chi.checkerboard=psi.checkerboard;
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
    // Apply (L^{\prime})^{-1}
    chi[ss]=psi[ss]; // chi[0]=psi[0]
    for(int s=1;s<Ls;s++){
@@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
    }
  }
  MooeeInvTime+=usecond();
 }
 template<class Impl>
@@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
  assert(psi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
@@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
    }
  }
  MooeeInvTime+=usecond();
 }
 #ifdef CAYLEY_DPERP_CACHE
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
-  int nsimd= Simd::Nsimd();
+  const int nsimd= Simd::Nsimd();
  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
@@ -86,35 +86,138 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
    d_p[ss] = diag[s];
  }}
  M5Dcalls++;
  M5Dtime-=usecond();
  assert(Nc==3);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 #if 0
      alignas(64) SiteHalfSpinor hp;
      alignas(64) SiteHalfSpinor hm;
      alignas(64) SiteSpinor fp;
      alignas(64) SiteSpinor fm;
-    alignas(64) SiteHalfSpinor hp;
+      for(int v=0;v<LLs;v++){
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
    alignas(64) SiteSpinor fm;
-    for(int v=0;v<LLs;v++){
+	int vp=(v+1)%LLs;
 	int vm=(v+LLs-1)%LLs;
-      int vp=(v+1)%LLs;
+	spProj5m(hp,psi[ss+vp]);
-      int vm=(v+LLs-1)%LLs;
+	spProj5p(hm,psi[ss+vm]);
-      spProj5m(hp,psi[ss+vp]);
+	if ( vp<=v ) rotate(hp,hp,1);
-      spProj5p(hm,psi[ss+vm]);
+	if ( vm>=v ) rotate(hm,hm,nsimd-1);
-      
+	
-      if ( vp<=v ) rotate(hp,hp,1);
+	hp=0.5*hp;
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+        hm=0.5*hm;
-      hp=hp*0.5;
+	spRecon5m(fp,hp);
-      hm=hm*0.5;
+	spRecon5p(fm,hm);
      spRecon5m(fp,hp);
      spRecon5p(fm,hm);
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+	chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+	chi[ss+v] = chi[ss+v]     +u[v]*fp;
 	chi[ss+v] = chi[ss+v]     +l[v]*fm;
-    }
+      }
 #else
      for(int v=0;v<LLs;v++){
 	vprefetch(psi[ss+v+LLs]);
 	//	vprefetch(phi[ss+v+LLs]);
 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
 	Simd hp_00 = psi[ss+vp]()(2)(0); 
 	Simd hp_01 = psi[ss+vp]()(2)(1); 
 	Simd hp_02 = psi[ss+vp]()(2)(2); 
 	Simd hp_10 = psi[ss+vp]()(3)(0); 
 	Simd hp_11 = psi[ss+vp]()(3)(1); 
 	Simd hp_12 = psi[ss+vp]()(3)(2); 
 	Simd hm_00 = psi[ss+vm]()(0)(0); 
 	Simd hm_01 = psi[ss+vm]()(0)(1); 
 	Simd hm_02 = psi[ss+vm]()(0)(2); 
 	Simd hm_10 = psi[ss+vm]()(1)(0); 
 	Simd hm_11 = psi[ss+vm]()(1)(1); 
 	Simd hm_12 = psi[ss+vm]()(1)(2); 
 	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
 	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
 	if ( vp<=v ) {
 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 	}
 	if ( vm>=v ) {
 	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 	}
 	/*
 	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
 	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
 	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
 	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
 	*/	
 	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
 	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
 	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
 	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
 	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
 	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
 	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
 	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
 	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
 	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
 	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
 	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
 	//	if ( ss==0){
 	/*
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
 	}
 	*/
 	vstream(chi[ss+v]()(0)(0),p_00);
 	vstream(chi[ss+v]()(0)(1),p_01);
 	vstream(chi[ss+v]()(0)(2),p_02);
 	vstream(chi[ss+v]()(1)(0),p_10);
 	vstream(chi[ss+v]()(1)(1),p_11);
 	vstream(chi[ss+v]()(1)(2),p_12);
 	vstream(chi[ss+v]()(2)(0),p_20);
 	vstream(chi[ss+v]()(2)(1),p_21);
 	vstream(chi[ss+v]()(2)(2),p_22);
 	vstream(chi[ss+v]()(3)(0),p_30);
 	vstream(chi[ss+v]()(3)(1),p_31);
 	vstream(chi[ss+v]()(3)(2),p_32);
      }
 #endif
  }
  M5Dtime+=usecond();
 }
 template<class Impl>  
@@ -154,6 +257,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
    d_p[ss] = diag[s];
  }}
  M5Dcalls++;
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
@@ -183,8 +288,8 @@ PARALLEL_FOR_LOOP
    }
  }
  M5Dtime+=usecond();
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
@@ -250,13 +355,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
    }
  }
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  // Dynamic allocate on stack to get per thread without serialised heap acces
-PARALLEL_FOR_LOOP
+#pragma omp parallel  
-  for(auto site=0;site<vol;site++){
+  {
    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
    Vector<SiteHalfSpinor> SitePplus(LLs);
    Vector<SiteHalfSpinor> SitePminus(LLs);
@@ -267,6 +370,9 @@ PARALLEL_FOR_LOOP
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
 #pragma omp for 
  for(auto site=0;site<vol;site++){
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
@@ -294,6 +400,8 @@ PARALLEL_FOR_LOOP
      chi[lex] = SiteChi[s]*0.5;
    }
  }
  }
  MooeeInvTime+=usecond();
 }
 INSTANTIATE_DPERP(DomainWallVec5dImplD);
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -514,7 +514,7 @@ namespace Optimization {
    template<int n>
    static inline __m256 tRotate(__m256 in){ 
      __m256 tmp = Permute::Permute0(in);
-      __m256 ret;
+      __m256 ret = in;
      if ( n > 3 ) { 
 	_mm256_alignr_epi32_grid(ret,in,tmp,n);  
      } else {
@@ -526,7 +526,7 @@ namespace Optimization {
    template<int n>
    static inline __m256d tRotate(__m256d in){ 
      __m256d tmp = Permute::Permute0(in);
-      __m256d ret;
+      __m256d ret = in;
      if ( n > 1 ) {
 	_mm256_alignr_epi64_grid(ret,in,tmp,n);          
      } else {
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -86,13 +86,13 @@ namespace Optimization {
  struct Vstream{
    //Float
    inline void operator()(float * a, __m512 b){
-      //_mm512_stream_ps(a,b);
+      _mm512_stream_ps(a,b);
-      _mm512_store_ps(a,b);
+      //      _mm512_store_ps(a,b);
    }
    //Double
    inline void operator()(double * a, __m512d b){
-      //_mm512_stream_pd(a,b);
+      _mm512_stream_pd(a,b);
-      _mm512_store_pd(a,b);
+      //      _mm512_store_pd(a,b);
    }
  };
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -130,7 +130,7 @@ class Grid_simd {
  Vector_type v;
-  static inline int Nsimd(void) {
+  static inline constexpr int Nsimd(void) {
    return sizeof(Vector_type) / sizeof(Scalar_type);
  }