Merge branch 'develop' into feature/hmc_generalise

2025-07-27 17:57:08 +01:00 · 2016-12-05 05:10:27 +00:00
parent 62749d05a6 e27c6b217c
commit 01480da0a8
30 changed files with 1384 additions and 3038 deletions
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -244,7 +244,10 @@ namespace Grid {
            pokeLocalSite(s,pgbuf,cbuf);
          }
        }
-        result = Cshift(result,dim,L);
+        if (p != processors[dim] - 1)
+        {
+          result = Cshift(result,dim,L);
+        }
      }
      
      // Loop over orthog coords
@@ -287,10 +290,10 @@ namespace Grid {
          cgbuf = clbuf;
          cgbuf[dim] = clbuf[dim]+L*pc;
          peekLocalSite(s,pgbuf,cgbuf);
-          s = s * div;
          pokeLocalSite(s,result,clbuf);
        }
      }
+      result = result*div;
      
      // destroying plan
      FFTW<scalar>::fftw_destroy_plan(p);
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -1080,10 +1080,10 @@ say con = 2
 **/

 template<class T>
-static void Lock(DenseMatrix<T> &H, 	///Hess mtx	
-		 DenseMatrix<T> &Q, 	///Lock Transform
-		 T val, 		///value to be locked
-		 int con, 	///number already locked
+static void Lock(DenseMatrix<T> &H, 	// Hess mtx	
+		 DenseMatrix<T> &Q, 	// Lock Transform
+		 T val, 		// value to be locked
+		 int con, 	// number already locked
 		 RealD small,
 		 int dfg,
 		 bool herm)
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -208,6 +208,7 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
+
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@@ -216,6 +217,20 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;

+// Ls vectorised 
+typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
+typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
+typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
+
+typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
+typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
+typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
+
+typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
+typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
+typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
+
+
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@@ -267,6 +282,7 @@ typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;


+
  }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
  }
 }
+
+
+template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
+{
+  this->Report();
+  std::vector<int> latt = GridDefaultLatt();          
+  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP     = this->_FourDimGrid->_Nprocessors;
+  if ( M5Dcalls > 0 ) {
+    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
+
+    // Flops = 6.0*(Nc*Ns) *Ls*vol
+    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  }
+
+  if ( MooeeInvCalls > 0 ) {
+
+    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
+
+    // Flops = 9*12*Ls*vol/2
+    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  }
+
+}
+template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
+{
+  this->ZeroCounters();
+  M5Dflops=0;
+  M5Dcalls=0;
+  M5Dtime=0;
+  MooeeInvFlops=0;
+  MooeeInvCalls=0;
+  MooeeInvTime=0;
+}
+
+
 template<class Impl>  
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -120,6 +120,18 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

+      
+
+     void CayleyReport(void);
+     void CayleyZeroCounters(void);
+
+     double M5Dflops;
+     double M5Dcalls;
+     double M5Dtime;
+
+     double MooeeInvFlops;
+     double MooeeInvCalls;
+     double MooeeInvTime;

    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    for(int s=0;s<Ls;s++){
@@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
      }
    }
  }
+  M5Dtime+=usecond();
 }

 template<class Impl>  
@@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;

+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
@@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
      }
    }
  }
+  M5Dtime+=usecond();
 }

 template<class Impl>
@@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &

  chi.checkerboard=psi.checkerboard;

+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];

+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
    // Apply (L^{\prime})^{-1}
    chi[ss]=psi[ss]; // chi[0]=psi[0]
    for(int s=1;s<Ls;s++){
@@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
    }
  }
+
+  MooeeInvTime+=usecond();
+
 }

 template<class Impl>
@@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
  assert(psi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;

+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();

 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
@@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
    }
  }
+
+  MooeeInvTime+=usecond();
+
 }

 #ifdef CAYLEY_DPERP_CACHE
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
-  int nsimd= Simd::Nsimd();
+  const int nsimd= Simd::Nsimd();

  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
@@ -86,35 +86,138 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
    d_p[ss] = diag[s];
  }}

+
+  M5Dcalls++;
+  M5Dtime-=usecond();
+
+  assert(Nc==3);
+
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+#if 0
+      alignas(64) SiteHalfSpinor hp;
+      alignas(64) SiteHalfSpinor hm;
+      alignas(64) SiteSpinor fp;
+      alignas(64) SiteSpinor fm;

-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
+      for(int v=0;v<LLs;v++){

-    for(int v=0;v<LLs;v++){
+	int vp=(v+1)%LLs;
+	int vm=(v+LLs-1)%LLs;

-      int vp=(v+1)%LLs;
-      int vm=(v+LLs-1)%LLs;
+	spProj5m(hp,psi[ss+vp]);
+	spProj5p(hm,psi[ss+vm]);

-      spProj5m(hp,psi[ss+vp]);
-      spProj5p(hm,psi[ss+vm]);
-      
-      if ( vp<=v ) rotate(hp,hp,1);
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+	if ( vp<=v ) rotate(hp,hp,1);
+	if ( vm>=v ) rotate(hm,hm,nsimd-1);
+	
+	hp=0.5*hp;
+        hm=0.5*hm;

-      hp=hp*0.5;
-      hm=hm*0.5;
-      spRecon5m(fp,hp);
-      spRecon5p(fm,hm);
+	spRecon5m(fp,hp);
+	spRecon5p(fm,hm);

-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+	chi[ss+v] = d[v]*phi[ss+v];
+	chi[ss+v] = chi[ss+v]     +u[v]*fp;
+	chi[ss+v] = chi[ss+v]     +l[v]*fm;

-    }
+      }
+#else
+      for(int v=0;v<LLs;v++){
+
+	vprefetch(psi[ss+v+LLs]);
+	//	vprefetch(phi[ss+v+LLs]);
+
+	int vp= (v==LLs-1) ? 0     : v+1;
+	int vm= (v==0    ) ? LLs-1 : v-1;
+	
+	Simd hp_00 = psi[ss+vp]()(2)(0); 
+	Simd hp_01 = psi[ss+vp]()(2)(1); 
+	Simd hp_02 = psi[ss+vp]()(2)(2); 
+	Simd hp_10 = psi[ss+vp]()(3)(0); 
+	Simd hp_11 = psi[ss+vp]()(3)(1); 
+	Simd hp_12 = psi[ss+vp]()(3)(2); 
+	
+	Simd hm_00 = psi[ss+vm]()(0)(0); 
+	Simd hm_01 = psi[ss+vm]()(0)(1); 
+	Simd hm_02 = psi[ss+vm]()(0)(2); 
+	Simd hm_10 = psi[ss+vm]()(1)(0); 
+	Simd hm_11 = psi[ss+vm]()(1)(1); 
+	Simd hm_12 = psi[ss+vm]()(1)(2); 
+
+	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
+	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
+
+	if ( vp<=v ) {
+	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+	}
+	if ( vm>=v ) {
+	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+	}
+
+	/*
+	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
+	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
+	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
+	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
+	*/	
+	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
+	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
+	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
+	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
+	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
+	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
+	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
+	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
+	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
+	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
+	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
+	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
+
+	
+	//	if ( ss==0){
+	/*
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
+	}
+	*/
+	vstream(chi[ss+v]()(0)(0),p_00);
+	vstream(chi[ss+v]()(0)(1),p_01);
+	vstream(chi[ss+v]()(0)(2),p_02);
+	vstream(chi[ss+v]()(1)(0),p_10);
+	vstream(chi[ss+v]()(1)(1),p_11);
+	vstream(chi[ss+v]()(1)(2),p_12);
+	vstream(chi[ss+v]()(2)(0),p_20);
+	vstream(chi[ss+v]()(2)(1),p_21);
+	vstream(chi[ss+v]()(2)(2),p_22);
+	vstream(chi[ss+v]()(3)(0),p_30);
+	vstream(chi[ss+v]()(3)(1),p_31);
+	vstream(chi[ss+v]()(3)(2),p_32);
+
+      }
+#endif
  }
+  M5Dtime+=usecond();
 }

 template<class Impl>  
@@ -154,6 +257,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
    d_p[ss] = diag[s];
  }}

+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs

@@ -183,8 +288,8 @@ PARALLEL_FOR_LOOP

    }
  }
+  M5Dtime+=usecond();
 }
-
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
@@ -250,13 +355,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
    }
  }
  
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
  // Dynamic allocate on stack to get per thread without serialised heap acces
-PARALLEL_FOR_LOOP
-  for(auto site=0;site<vol;site++){
-    
-    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
-    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
-    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
+#pragma omp parallel  
+  {

    Vector<SiteHalfSpinor> SitePplus(LLs);
    Vector<SiteHalfSpinor> SitePminus(LLs);
@@ -267,6 +370,9 @@ PARALLEL_FOR_LOOP
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;

+#pragma omp for 
+  for(auto site=0;site<vol;site++){
+
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
@@ -294,6 +400,8 @@ PARALLEL_FOR_LOOP
      chi[lex] = SiteChi[s]*0.5;
    }
  }
+  }
+  MooeeInvTime+=usecond();
 }

 INSTANTIATE_DPERP(DomainWallVec5dImplD);
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -194,6 +194,11 @@ void WilsonFermion5D<Impl>::Report(void)
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;

+    RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+
+
   }

  if ( DerivCalls > 0 ) {
@@ -209,12 +214,15 @@ void WilsonFermion5D<Impl>::Report(void)
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
-  }
+
+    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }

  if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl;  Stencil.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl;  StencilOdd.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
  }
 }

--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -167,7 +167,7 @@ namespace Optimization {
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
@@ -195,7 +195,7 @@ namespace Optimization {
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
@@ -216,7 +216,7 @@ namespace Optimization {
  struct MultComplex{
    // Complex float
    inline __m256 operator()(__m256 a, __m256 b){
-#if defined (AVX1) 
+#if defined (AVX1)
      __m256 ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
@@ -233,7 +233,7 @@ namespace Optimization {
      a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)
+#if defined (AVX2)  || defined (AVXFMA)
      __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
      __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
      a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ));  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@@ -264,7 +264,7 @@ namespace Optimization {
 	IF IMM0[3] = 0
 	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
      */
-#if defined (AVX1) 
+#if defined (AVX1)
      __m256d ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
      ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
@@ -279,7 +279,7 @@ namespace Optimization {
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
      __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@@ -320,7 +320,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
      a= _mm256_macc_ps(b,c,a);
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
      a= _mm256_fmadd_ps( b, c, a);
 #endif
    }
@@ -332,7 +332,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
      a= _mm256_macc_pd(b,c,a);
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
      a= _mm256_fmadd_pd( b, c, a);
 #endif
    }
@@ -347,7 +347,7 @@ namespace Optimization {
    }
    // Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) 
+#if defined (AVX1) || defined (AVXFMA)
      __m128i a0,a1;
      __m128i b0,b1;
      a0 = _mm256_extractf128_si256(a,0);
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -27,15 +27,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-//----------------------------------------------------------------------
-/*! @file Grid_knc.h
-  @brief Optimization libraries for AVX512 instructions set for KNC
-
-  Using intrinsics
-*/
-// Time-stamp: <2015-06-09 14:27:28 neo>
-//----------------------------------------------------------------------
-
 #include <immintrin.h>


@@ -95,13 +86,13 @@ namespace Optimization {
  struct Vstream{
    //Float
    inline void operator()(float * a, __m512 b){
-      //_mm512_stream_ps(a,b);
-      _mm512_store_ps(a,b);
+      _mm512_stream_ps(a,b);
+      //      _mm512_store_ps(a,b);
    }
    //Double
    inline void operator()(double * a, __m512d b){
-      //_mm512_stream_pd(a,b);
-      _mm512_store_pd(a,b);
+      _mm512_stream_pd(a,b);
+      //      _mm512_store_pd(a,b);
    }

  };
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@@ -6,8 +6,7 @@

    Copyright (C) 2015

-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
+Author: Antonin Portelli <antonin.portelli@me.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -27,133 +26,352 @@ Author: neo <cossu@post.kek.jp>
    *************************************************************************************/
    /*  END LEGAL */

+static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
+
+//#define VECTOR_LOOPS
+
+// playing with compiler pragmas
+#ifdef VECTOR_LOOPS
+#ifdef __clang__
+#define VECTOR_FOR(i, w, inc)\
+_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
+for (unsigned int i = 0; i < w; i += inc)
+#elif defined __INTEL_COMPILER
+#define VECTOR_FOR(i, w, inc)\
+_Pragma("simd vectorlength(w*8)")\
+for (unsigned int i = 0; i < w; i += inc)
+#else
+#define VECTOR_FOR(i, w, inc)\
+for (unsigned int i = 0; i < w; i += inc)
+#endif
+#else
+#define VECTOR_FOR(i, w, inc)\
+for (unsigned int i = 0; i < w; i += inc)
+#endif
+
 namespace Grid {
 namespace Optimization {

-  template<class vtype>
-  union uconv {
-    float f;
-    vtype v;
+  // type traits giving the number of elements for each vector type
+  template <typename T> struct W;
+  template <> struct W<double> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
  };
-
-  union u128f {
-    float v;
-    float f[4];
-  };
-  union u128d {
-    double v;
-    double f[2];
+  template <> struct W<float> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
  };
  
+  // SIMD vector types
+  template <typename T>
+  struct vec {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+  };
+  
+  typedef vec<float>   vecf;
+  typedef vec<double>  vecd;
+  
  struct Vsplat{
-    //Complex float
-    inline u128f operator()(float a, float b){
-      u128f out; 
-      out.f[0] = a;
-      out.f[1] = b;
-      out.f[2] = a;
-      out.f[3] = b;
+    // Complex
+    template <typename T>
+    inline vec<T> operator()(T a, T b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::r, 2)
+      {
+        out.v[i]   = a;
+        out.v[i+1] = b;
+      }
+
      return out;
    }
-    // Real float
-    inline u128f operator()(float a){
-      u128f out; 
-      out.f[0] = a;
-      out.f[1] = a;
-      out.f[2] = a;
-      out.f[3] = a;
+    
+    // Real
+    template <typename T>
+    inline vec<T> operator()(T a){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a;
+      }
+      
      return out;
    }
-    //Complex double
-    inline u128d operator()(double a, double b){
-      u128d out; 
-      out.f[0] = a;
-      out.f[1] = b;
-      return out;
-    }
-    //Real double
-    inline u128d operator()(double a){
-      u128d out; 
-      out.f[0] = a;
-      out.f[1] = a;
-      return out;
-    }
-    //Integer
+    
+    // Integer
    inline int operator()(Integer a){
      return a;
    }
  };

  struct Vstore{
-    //Float 
-    inline void operator()(u128f a, float* F){
-      memcpy(F,a.f,4*sizeof(float));
-    }
-    //Double
-    inline void operator()(u128d a, double* D){
-      memcpy(D,a.f,2*sizeof(double));
+    // Real
+    template <typename T>
+    inline void operator()(vec<T> a, T *D){
+      *((vec<T> *)D) = a;
    }
    //Integer
-    inline void operator()(int a, Integer* I){
-      I[0] = a;
+    inline void operator()(int a, Integer *I){
+      *I = a;
    }

  };

  struct Vstream{
-    //Float
-    inline void operator()(float * a, u128f b){
-      memcpy(a,b.f,4*sizeof(float));
+    // Real
+    template <typename T>
+    inline void operator()(T * a, vec<T> b){
+      *((vec<T> *)a) = b;
    }
-    //Double
-    inline void operator()(double * a, u128d b){
-      memcpy(a,b.f,2*sizeof(double));
-    }
-
-
  };

  struct Vset{
-    // Complex float 
-    inline u128f operator()(Grid::ComplexF *a){
-      u128f out; 
-      out.f[0] = a[0].real();
-      out.f[1] = a[0].imag();
-      out.f[2] = a[1].real();
-      out.f[3] = a[1].imag();
+    // Complex
+    template <typename T>
+    inline vec<T> operator()(std::complex<T> *a){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        out.v[2*i]   = a[i].real();
+        out.v[2*i+1] = a[i].imag();
+      }
+      
      return out;
    }
-    // Complex double 
-    inline u128d operator()(Grid::ComplexD *a){
-      u128d out; 
-      out.f[0] = a[0].real();
-      out.f[1] = a[0].imag();
-      return out;
-    }
-    // Real float 
-    inline u128f operator()(float *a){
-      u128f out; 
-      out.f[0] = a[0];
-      out.f[1] = a[1];
-      out.f[2] = a[2];
-      out.f[3] = a[3];
-      return out;
-    }
-    // Real double
-    inline u128d operator()(double *a){
-      u128d out; 
-      out.f[0] = a[0];
-      out.f[1] = a[1];
+    
+    // Real
+    template <typename T>
+    inline vec<T> operator()(T *a){
+      vec<T> out;
+      
+      out = *((vec<T> *)a);
+      
      return out;
    }
+
    // Integer
    inline int operator()(Integer *a){
-      return a[0];
+      return *a;
    }
-
-
  };

+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    // Complex/Real
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a.v[i] + b.v[i];
+      }
+      
+      return out;
+    }
+    
+    // Integer
+    inline int operator()(int a, int b){
+      return a + b;
+    }
+  };
+
+  struct Sub{
+    // Complex/Real
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a.v[i] - b.v[i];
+      }
+      
+      return out;
+    }
+    
+    //Integer
+    inline int operator()(int a, int b){
+      return a-b;
+    }
+  };
+
+  struct Mult{
+    // Real
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a.v[i]*b.v[i];
+      }
+      
+      return out;
+    }
+    
+    // Integer
+    inline int operator()(int a, int b){
+      return a*b;
+    }
+  };
+  
+  #define cmul(a, b, c, i)\
+  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
+  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
+  
+  struct MultComplex{
+    // Complex
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        cmul(a.v, b.v, out.v, 2*i);
+      }      
+      
+      return out;
+    }
+  };
+  
+  #undef cmul
+
+  struct Div{
+    // Real
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a.v[i]/b.v[i];
+      }
+      
+      return out;
+    }
+  };
+  
+  #define conj(a, b, i)\
+  b[i]   = a[i];\
+  b[i+1] = -a[i+1];
+  
+  struct Conj{
+    // Complex
+    template <typename T>
+    inline vec<T> operator()(vec<T> a){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        conj(a.v, out.v, 2*i);
+      }
+      
+      return out;
+    }
+  };
+  
+  #undef conj
+
+  #define timesmi(a, b, i)\
+  b[i]   = a[i+1];\
+  b[i+1] = -a[i];
+  
+  struct TimesMinusI{
+    // Complex
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        timesmi(a.v, out.v, 2*i);
+      }
+      
+      return out;
+    }
+  };
+
+  #undef timesmi
+  
+  #define timesi(a, b, i)\
+  b[i]   = -a[i+1];\
+  b[i+1] = a[i];
+  
+  struct TimesI{
+    // Complex
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        timesi(a.v, out.v, 2*i);
+      }
+      
+      return out;
+    }
+  };
+  
+  #undef timesi
+
+  //////////////////////////////////////////////
+  // Some Template specialization
+  #define perm(a, b, n, w)\
+  unsigned int _mask = w >> (n + 1);\
+  VECTOR_FOR(i, w, 1)\
+  {\
+    b[i] = a[i^_mask];\
+  }
+  
+  #define DECL_PERMUTE_N(n)\
+  template <typename T>\
+  static inline vec<T> Permute##n(vec<T> in) {\
+    vec<T> out;\
+    perm(in.v, out.v, n, W<T>::r);\
+    return out;\
+  }
+  
+  struct Permute{
+    DECL_PERMUTE_N(0);
+    DECL_PERMUTE_N(1);
+    DECL_PERMUTE_N(2);
+    DECL_PERMUTE_N(3);
+  };
+  
+  #undef perm
+  #undef DECL_PERMUTE_N
+  
+  #define rot(a, b, n, w)\
+  VECTOR_FOR(i, w, 1)\
+  {\
+    b[i] = a[(i + n)%w];\
+  }
+  
+  struct Rotate{
+    template <typename T>
+    static inline vec<T> rotate(vec<T> in, int n){
+      vec<T> out;
+      
+      rot(in.v, out.v, n, W<T>::r);
+      
+      return out;
+    }
+  };
+
+  #undef rot
+  
+  #define acc(v, a, off, step, n)\
+  for (unsigned int i = off; i < n; i += step)\
+  {\
+    a += v[i];\
+  }
+  
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
@@ -164,316 +382,67 @@ namespace Optimization {
      return 0;
    }
  };
-
-  /////////////////////////////////////////////////////
-  // Arithmetic operations
-  /////////////////////////////////////////////////////
-  struct Sum{
-    //Complex/Real float
-    inline u128f operator()(u128f a, u128f b){
-      u128f out;
-      out.f[0] = a.f[0] + b.f[0];
-      out.f[1] = a.f[1] + b.f[1];
-      out.f[2] = a.f[2] + b.f[2];
-      out.f[3] = a.f[3] + b.f[3];
-      return out;
-    }
-    //Complex/Real double
-    inline u128d operator()(u128d a, u128d b){
-      u128d out;
-      out.f[0] = a.f[0] + b.f[0];
-      out.f[1] = a.f[1] + b.f[1];
-      return out;
-    }
-    //Integer
-    inline int operator()(int a, int b){
-      return a + b;
-    }
-  };
-
-  struct Sub{
-    //Complex/Real float
-    inline u128f operator()(u128f a, u128f b){
-      u128f out;
-      out.f[0] = a.f[0] - b.f[0];
-      out.f[1] = a.f[1] - b.f[1];
-      out.f[2] = a.f[2] - b.f[2];
-      out.f[3] = a.f[3] - b.f[3];
-      return out;
-    }
-    //Complex/Real double
-    inline u128d operator()(u128d a, u128d b){
-      u128d out;
-      out.f[0] = a.f[0] - b.f[0];
-      out.f[1] = a.f[1] - b.f[1];
-      return out;
-    }
-    //Integer
-    inline int operator()(int a, int b){
-      return a-b;
-    }
-  };
-
-  struct MultComplex{
-    // Complex float
-    inline u128f operator()(u128f a, u128f b){
-      u128f out;
-      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
-      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
-      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
-      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
-      return out;
-    }
-    // Complex double
-    inline u128d operator()(u128d a, u128d b){
-      u128d out;
-      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
-      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
-      return out;
-    }
-  };
-
-  struct Mult{
-    //CK: Appear unneeded
-    // inline float  mac(float a, float b,double c){
-    //   return 0;
-    // }
-    // inline double mac(double a, double b,double c){
-    //   return 0;
-    // }
-
-    // Real float
-    inline u128f operator()(u128f a, u128f b){
-      u128f out;
-      out.f[0] = a.f[0]*b.f[0];
-      out.f[1] = a.f[1]*b.f[1];
-      out.f[2] = a.f[2]*b.f[2];
-      out.f[3] = a.f[3]*b.f[3];
-      return out;
-    }
-    // Real double
-    inline u128d operator()(u128d a, u128d b){
-      u128d out;
-      out.f[0] = a.f[0]*b.f[0];
-      out.f[1] = a.f[1]*b.f[1];
-      return out;
-    }
-    // Integer
-    inline int operator()(int a, int b){
-      return a*b;
-    }
-  };
-
-  struct Conj{
-    // Complex single
-    inline u128f operator()(u128f in){
-      u128f out;
-      out.f[0] = in.f[0];
-      out.f[1] = -in.f[1];
-      out.f[2] = in.f[2];
-      out.f[3] = -in.f[3];
-      return out;
-    }
-    // Complex double
-    inline u128d operator()(u128d in){
-      u128d out;
-      out.f[0] = in.f[0];
-      out.f[1] = -in.f[1];
-      return out;
-    }
-    // do not define for integer input
-  };
-
-  struct TimesMinusI{
-    //Complex single
-    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
-      u128f out;
-      out.f[0] = in.f[1];
-      out.f[1] = -in.f[0];
-      out.f[2] = in.f[3];
-      out.f[3] = -in.f[2];
-      return out;
-    }
-    //Complex double
-    inline u128d operator()(u128d in, u128d ret){
-      u128d out;
-      out.f[0] = in.f[1];
-      out.f[1] = -in.f[0];
-      return out;
-    }
-  };
-
-  struct TimesI{
-    //Complex single
-    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
-      u128f out;
-      out.f[0] = -in.f[1];
-      out.f[1] = in.f[0];
-      out.f[2] = -in.f[3];
-      out.f[3] = in.f[2];
-      return out;
-    }
-    //Complex double
-    inline u128d operator()(u128d in, u128d ret){
-      u128d out;
-      out.f[0] = -in.f[1];
-      out.f[1] = in.f[0];
-      return out;
-    }
-  };
-
-  //////////////////////////////////////////////
-  // Some Template specialization
-  struct Permute{
-    //We just have to mirror the permutes of Grid_sse4.h
-    static inline u128f Permute0(u128f in){ //AB CD -> CD AB
-      u128f out;
-      out.f[0] = in.f[2];
-      out.f[1] = in.f[3];
-      out.f[2] = in.f[0];
-      out.f[3] = in.f[1];
-      return out;
-    };
-    static inline u128f Permute1(u128f in){ //AB CD -> BA DC
-      u128f out;
-      out.f[0] = in.f[1];
-      out.f[1] = in.f[0];
-      out.f[2] = in.f[3];
-      out.f[3] = in.f[2];
-      return out;
-    };
-    static inline u128f Permute2(u128f in){
-      return in;
-    };
-    static inline u128f Permute3(u128f in){
-      return in;
-    };
-
-    static inline u128d Permute0(u128d in){ //AB -> BA
-      u128d out;
-      out.f[0] = in.f[1];
-      out.f[1] = in.f[0];
-      return out;      
-    };
-    static inline u128d Permute1(u128d in){
-      return in;
-    };
-    static inline u128d Permute2(u128d in){
-      return in;
-    };
-    static inline u128d Permute3(u128d in){
-      return in;
-    };
-
-  };
  
-  template < typename vtype > 
-    void permute(vtype &a, vtype b, int perm) {
-   };
-    
-  struct Rotate{
-
-    static inline u128f rotate(u128f in,int n){
-      u128f out;
-      switch(n){
-      case 0:
-        out.f[0] = in.f[0];
-        out.f[1] = in.f[1];
-        out.f[2] = in.f[2];
-        out.f[3] = in.f[3];
-        break;
-      case 1:
-        out.f[0] = in.f[1];
-        out.f[1] = in.f[2];
-        out.f[2] = in.f[3];
-        out.f[3] = in.f[0];
-        break;
-      case 2:
-        out.f[0] = in.f[2];
-        out.f[1] = in.f[3];
-        out.f[2] = in.f[0];
-        out.f[3] = in.f[1];
-        break;
-      case 3:
-        out.f[0] = in.f[3];
-        out.f[1] = in.f[0];
-        out.f[2] = in.f[1];
-        out.f[3] = in.f[2];
-        break;
-      default: assert(0);
-      }
-      return out;
-    }
-    static inline u128d rotate(u128d in,int n){
-      u128d out;
-      switch(n){
-      case 0:
-        out.f[0] = in.f[0];
-        out.f[1] = in.f[1];
-        break;
-      case 1:
-        out.f[0] = in.f[1];
-        out.f[1] = in.f[0];
-        break;
-      default: assert(0);
-      }
-      return out;
-    }
-  };
-
  //Complex float Reduce
-  template<>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
-    return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
+  template <>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
+    float a = 0.f, b = 0.f;
+    // a sums the even-indexed entries of in.v, b the odd-indexed ones, over indices below W<float>::r
+    acc(in.v, a, 0, 2, W<float>::r);
+    acc(in.v, b, 1, 2, W<float>::r);
+    // a and b become the first and second Grid::ComplexF argument (presumably real and imaginary part)
+    return Grid::ComplexF(a, b);
  }
+  
  //Real float Reduce
  template<>
-  inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
-    return in.f[0] + in.f[1] + in.f[2] + in.f[3];
+  inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){ // returns the sum of the entries of in.v
+    float a = 0.f;
+    // acc adds in.v[0], in.v[1], ... for every index below W<float>::r onto a
+    acc(in.v, a, 0, 1, W<float>::r);
+    
+    return a;
  }
  
-  
  //Complex double Reduce
  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
-    return Grid::ComplexD(in.f[0],in.f[1]);
+  inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
+    double a = 0., b = 0.;
+    // a sums the even-indexed entries of in.v, b the odd-indexed ones, over indices below W<double>::r
+    acc(in.v, a, 0, 2, W<double>::r);
+    acc(in.v, b, 1, 2, W<double>::r);
+    // a and b become the first and second Grid::ComplexD argument (presumably real and imaginary part)
+    return Grid::ComplexD(a, b);
  }
  
  //Real double Reduce
  template<>
-  inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
-    return in.f[0] + in.f[1];
+  inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){ // returns the sum of the entries of in.v
+    double a = 0.;
+    // acc adds in.v[0], in.v[1], ... for every index below W<double>::r onto a
+    acc(in.v, a, 0, 1, W<double>::r);
+    
+    return a;
  }

  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, int>::operator()(int in){
-    // FIXME unimplemented
-   printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return in; // the input is a single int, so it is returned unchanged
  }
 }

 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 

-  typedef Optimization::u128f SIMD_Ftype;  // Single precision type
-  typedef Optimization::u128d SIMD_Dtype; // Double precision type
+  typedef Optimization::vecf SIMD_Ftype; // Single precision type
+  typedef Optimization::vecd SIMD_Dtype; // Double precision type
  typedef int SIMD_Itype; // Integer type

  // prefetch utilities
  inline void v_prefetch0(int size, const char *ptr){};
  inline void prefetch_HINT_T0(const char *ptr){};

-
-
-  // Gpermute function
-  template < typename VectorSIMD > 
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  }
-
-
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
@@ -481,16 +450,13 @@ namespace Optimization {
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

- 
-
-
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
-
 }
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -26,14 +26,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-//----------------------------------------------------------------------
-/*! @file Grid_knc.h
-  @brief Optimization libraries for AVX512 instructions set for KNC
-
-  Using intrinsics
-*/
-// Time-stamp: <2015-06-09 14:27:28 neo>
-//----------------------------------------------------------------------

 #include <immintrin.h>
 #include <zmmintrin.h>
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -244,7 +244,22 @@ namespace Optimization {
      return a*b;
    }
  };
-  
+
+  struct Div{ // division functor with overloads for double vectors, float (via macro) and int
+    // Real double: forwards both operands to vec_swdiv
+    inline vector4double operator()(vector4double a, vector4double b){
+      return vec_swdiv(a, b);
+    }
+
+    // Real float: overload generated by FLOAT_WRAP_2 (macro defined outside this chunk)
+    FLOAT_WRAP_2(operator(), inline)
+
+    // Integer: plain C++ integer division; b == 0 is not checked here
+    inline int operator()(int a, int b){
+      return a/b;
+    }
+  };
+
  struct Conj{
    // Complex double
    inline vector4double operator()(vector4double v){
@@ -413,6 +428,7 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Mult        MultSIMD;
+typedef Optimization::Div         DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -38,13 +38,13 @@ directory
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES

-#ifdef GENERIC_VEC
+#ifdef GEN
 #include "Grid_generic.h"
 #endif
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
+#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
@@ -130,7 +130,7 @@ class Grid_simd {

  Vector_type v;

-  static inline int Nsimd(void) {
+  static inline constexpr int Nsimd(void) { // how many Scalar_type values fit in one Vector_type
     return sizeof(Vector_type) / sizeof(Scalar_type);
   }