Merge branch 'develop' into feature/hmc_generalise

2025-12-23 22:24:30 +00:00 · 2017-01-25 11:33:53 +00:00
parent 0baa20d292 d65e81518f
commit 17629b8d9e
88 changed files with 7904 additions and 430 deletions
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -29,6 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */

+#include <Grid/Eigen/Dense>
 #include <Grid.h>


@@ -48,18 +49,18 @@ namespace QCD {
 		   FourDimGrid,
 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
- { }
+ { 
+ }

 template<class Impl>  
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  FermionField tmp(psi._grid);

-  this->DW(psi,tmp,DaggerNo);
+  this->DW(psi,this->tmp(),DaggerNo);

  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
  }
 }

@@ -87,8 +88,8 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;

-    // Flops = 9*12*Ls*vol/2
-    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
+    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }
@@ -110,12 +111,11 @@ template<class Impl>
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  FermionField tmp(psi._grid);

-  this->DW(psi,tmp,DaggerYes);
+  this->DW(psi,this->tmp(),DaggerYes);

  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
  }
 }
 template<class Impl>  
@@ -138,6 +138,7 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
+// FIXME Redundant with the above routine; check this and eliminate
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
@@ -259,36 +260,33 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  FermionField tmp(psi._grid);

-  Meooe5D(psi,tmp); 
+  Meooe5D(psi,this->tmp()); 

  if ( psi.checkerboard == Odd ) {
-    this->DhopEO(tmp,chi,DaggerNo);
+    this->DhopEO(this->tmp(),chi,DaggerNo);
  } else {
-    this->DhopOE(tmp,chi,DaggerNo);
+    this->DhopOE(this->tmp(),chi,DaggerNo);
  }
 }

 template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
 {
-  FermionField tmp(psi._grid);
  // Apply 4d dslash
  if ( psi.checkerboard == Odd ) {
-    this->DhopEO(psi,tmp,DaggerYes);
+    this->DhopEO(psi,this->tmp(),DaggerYes);
  } else {
-    this->DhopOE(psi,tmp,DaggerYes);
+    this->DhopOE(psi,this->tmp(),DaggerYes);
  }
-  MeooeDag5D(tmp,chi); 
+  MeooeDag5D(this->tmp(),chi); 
 }

 template<class Impl>
 void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  FermionField tmp(psi._grid);
-  Meo5D(psi,tmp);
+  Meo5D(psi,this->tmp());
  // Apply 4d dslash fragment
-  this->DhopDir(tmp,chi,dir,disp);
+  this->DhopDir(this->tmp(),chi,dir,disp);
 }
 // force terms; five routines; default to Dhop on diagonal
 template<class Impl>
@@ -459,9 +457,91 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
    dee[Ls-1] += delta_d;
  }  
+
+  int inv=1;
+  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
+  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
+
 }


+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
+						 Vector<iSinglet<Simd> > & Matp,
+						 Vector<iSinglet<Simd> > & Matm)
+{
+  int Ls=this->Ls;
+
+  GridBase *grid = this->FermionRedBlackGrid();
+  int LLs = grid->_rdimensions[0];
+
+  if ( LLs == Ls ) return; // Not vectorised in 5th direction
+
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+  
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0];
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXcd PplusMat ;
+  Eigen::MatrixXcd PminusMat;
+  
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+  
+  typedef typename SiteHalfSpinor::scalar_type scalar_type;
+  const int Nsimd=Simd::Nsimd();
+  Matp.resize(Ls*LLs);
+  Matm.resize(Ls*LLs);
+
+  for(int s2=0;s2<Ls;s2++){
+  for(int s1=0;s1<LLs;s1++){
+    int istride = LLs;
+    int ostride = 1;
+    Simd Vp;
+    Simd Vm;
+    scalar_type *sp = (scalar_type *)&Vp;
+    scalar_type *sm = (scalar_type *)&Vm;
+    for(int l=0;l<Nsimd;l++){
+      if ( switcheroo<Coeff_t>::iscomplex() ) {
+	sp[l] = PplusMat (l*istride+s1*ostride,s2);
+	sm[l] = PminusMat(l*istride+s1*ostride,s2);
+      } else { 
+      // if real
+	scalar_type tmp;
+	tmp = PplusMat (l*istride+s1*ostride,s2);
+	sp[l] = scalar_type(tmp.real(),tmp.real());
+	tmp = PminusMat(l*istride+s1*ostride,s2);
+	sm[l] = scalar_type(tmp.real(),tmp.real());
+      }
+    }
+    Matp[LLs*s2+s1] = Vp;
+    Matm[LLs*s2+s1] = Vm;
+  }}
+}
+

  FermOpTemplateInstantiate(CayleyFermion5D);
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -33,6 +33,31 @@ namespace Grid {

  namespace QCD {

+     template<typename T> struct switcheroo   {  
+       static inline int iscomplex()  { return 0; } 
+
+       template<class vec>
+       static inline vec mult(vec a, vec b) {
+	 return real_mult(a,b);
+       }
+     };
+     template<> struct switcheroo<ComplexD> {  
+       static inline int iscomplex()  { return 1; } 
+
+       template<class vec>
+       static inline vec mult(vec a, vec b) {
+	 return a*b;
+       }
+     };
+     template<> struct switcheroo<ComplexF> {  
+       static inline int iscomplex()  { return 1; } 
+       template<class vec>
+       static inline vec mult(vec a, vec b) {
+	 return a*b;
+       }
+     };
+
+
    template<class Impl>
    class CayleyFermion5D : public WilsonFermion5D<Impl>
    {
@@ -75,7 +100,19 @@ namespace Grid {
 		  std::vector<Coeff_t> &lower,
 		  std::vector<Coeff_t> &diag,
 		  std::vector<Coeff_t> &upper);
+
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
+      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd> > & Matp, Vector<iSinglet<Simd> > & Matm);
+
+      void MooeeInternalAsm(const FermionField &in, FermionField &out,
+			    int LLs, int site,
+			    Vector<iSinglet<Simd> > &Matp,
+			    Vector<iSinglet<Simd> > &Matm);
+      void MooeeInternalZAsm(const FermionField &in, FermionField &out,
+			    int LLs, int site,
+			    Vector<iSinglet<Simd> > &Matp,
+			    Vector<iSinglet<Simd> > &Matm);
+

      virtual void   Instantiatable(void)=0;

@@ -112,6 +149,12 @@ namespace Grid {
      std::vector<Coeff_t> ueem;    
      std::vector<Coeff_t> dee;    

+      // Matrices of 5d ee inverse params
+      Vector<iSinglet<Simd> >  MatpInv;
+      Vector<iSinglet<Simd> >  MatmInv;
+      Vector<iSinglet<Simd> >  MatpInvDag;
+      Vector<iSinglet<Simd> >  MatmInvDag;
+
      // Constructors
      CayleyFermion5D(GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -29,13 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */

-#include <Grid/Eigen/Dense>
+
 #include <Grid.h>


 namespace Grid {
-namespace QCD {
-  /*
+namespace QCD {  /*
   * Dense matrix versions of routines
   */
 template<class Impl>
@@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP
      for(int v=0;v<LLs;v++){

 	vprefetch(psi[ss+v+LLs]);
-	//	vprefetch(phi[ss+v+LLs]);

 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
@@ -145,9 +143,6 @@ PARALLEL_FOR_LOOP
 	Simd hm_11 = psi[ss+vm]()(1)(1); 
 	Simd hm_12 = psi[ss+vm]()(1)(2); 

-	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
-	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
-
 	if ( vp<=v ) {
 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
@@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP
 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 	}

-	/*
-	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
-	*/	
-	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
-	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
-	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
-	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
-	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
-	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
-	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
-	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
-	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
-	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
-	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
-	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
+	// Can force these to real arithmetic and save 2x.
+	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
+	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
+	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 

-	
-	//	if ( ss==0){
-	/*
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
-	}
-	*/
 	vstream(chi[ss+v]()(0)(0),p_00);
 	vstream(chi[ss+v]()(0)(1),p_01);
 	vstream(chi[ss+v]()(0)(2),p_02);
@@ -261,7 +234,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-
+#if 0
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
@@ -287,9 +260,504 @@ PARALLEL_FOR_LOOP
      chi[ss+v] = chi[ss+v]     +l[v]*fm;

    }
+#else
+      for(int v=0;v<LLs;v++){
+
+	vprefetch(psi[ss+v+LLs]);
+
+	int vp= (v==LLs-1) ? 0     : v+1;
+	int vm= (v==0    ) ? LLs-1 : v-1;
+	
+	Simd hp_00 = psi[ss+vp]()(0)(0); 
+	Simd hp_01 = psi[ss+vp]()(0)(1); 
+	Simd hp_02 = psi[ss+vp]()(0)(2); 
+	Simd hp_10 = psi[ss+vp]()(1)(0); 
+	Simd hp_11 = psi[ss+vp]()(1)(1); 
+	Simd hp_12 = psi[ss+vp]()(1)(2); 
+	
+	Simd hm_00 = psi[ss+vm]()(2)(0); 
+	Simd hm_01 = psi[ss+vm]()(2)(1); 
+	Simd hm_02 = psi[ss+vm]()(2)(2); 
+	Simd hm_10 = psi[ss+vm]()(3)(0); 
+	Simd hm_11 = psi[ss+vm]()(3)(1); 
+	Simd hm_12 = psi[ss+vm]()(3)(2); 
+
+	if ( vp<=v ) {
+	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+	}
+	if ( vm>=v ) {
+	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+	}
+
+	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
+	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
+	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+
+	vstream(chi[ss+v]()(0)(0),p_00);
+	vstream(chi[ss+v]()(0)(1),p_01);
+	vstream(chi[ss+v]()(0)(2),p_02);
+	vstream(chi[ss+v]()(1)(0),p_10);
+	vstream(chi[ss+v]()(1)(1),p_11);
+	vstream(chi[ss+v]()(1)(2),p_12);
+	vstream(chi[ss+v]()(2)(0),p_20);
+	vstream(chi[ss+v]()(2)(1),p_21);
+	vstream(chi[ss+v]()(2)(2),p_22);
+	vstream(chi[ss+v]()(3)(0),p_30);
+	vstream(chi[ss+v]()(3)(1),p_31);
+	vstream(chi[ss+v]()(3)(2),p_32);
+      }
+#endif
  }
  M5Dtime+=usecond();
 }
+
+
+#ifdef AVX512 
+#include <simd/Intel512common.h>
+#include <simd/Intel512avx.h>
+#include <simd/Intel512single.h>
+#endif 
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
+					     int LLs, int site,
+					     Vector<iSinglet<Simd> > &Matp,
+					     Vector<iSinglet<Simd> > &Matm)
+{
+#ifndef AVX512
+  {
+  SiteHalfSpinor BcastP;
+  SiteHalfSpinor BcastM;
+  SiteHalfSpinor SiteChiP;
+  SiteHalfSpinor SiteChiM;
+
+  // Ls*Ls * 2 * 12 * vol flops
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+        int s=s2+l*LLs;
+	int lex=s2+LLs*site;
+	
+	if ( s2==0 && l==0) {
+	  SiteChiP=zero;
+	  SiteChiM=zero;
+	}
+	
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	}}
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	}}
+
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
+	  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
+	}}
+
+    }}
+    {
+      int lex = s1+LLs*site;
+      for(int sp=0;sp<2;sp++){
+      for(int co=0;co<Nc;co++){
+	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+      }}
+    }
+  }
+
+  }
+#else
+  {
+  // pointers
+    //  MASK_REGS;
+#define Chi_00 %%zmm1
+#define Chi_01 %%zmm2
+#define Chi_02 %%zmm3
+#define Chi_10 %%zmm4
+#define Chi_11 %%zmm5
+#define Chi_12 %%zmm6
+#define Chi_20 %%zmm7
+#define Chi_21 %%zmm8
+#define Chi_22 %%zmm9
+#define Chi_30 %%zmm10
+#define Chi_31 %%zmm11
+#define Chi_32 %%zmm12
+
+#define BCAST0   %%zmm13
+#define BCAST1   %%zmm14
+#define BCAST2   %%zmm15
+#define BCAST3   %%zmm16
+#define BCAST4   %%zmm17
+#define BCAST5   %%zmm18
+#define BCAST6   %%zmm19
+#define BCAST7   %%zmm20
+#define BCAST8   %%zmm21
+#define BCAST9   %%zmm22
+#define BCAST10  %%zmm23
+#define BCAST11  %%zmm24
+
+  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      int lex=s2+LLs*site;
+      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+      uint64_t a2 = (uint64_t)&psi[lex];
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	if ( (s2+l)==0 ) {
+	  asm (
+  	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
+  	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
+  	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
+		   VBCASTCDUP(0,%2,BCAST0)   
+		   VBCASTCDUP(1,%2,BCAST1)   
+		   VBCASTCDUP(2,%2,BCAST2)   
+		   VBCASTCDUP(3,%2,BCAST3)   
+		   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
+		   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
+		   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
+		   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
+		   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
+		   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
+		   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
+		   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
+		   VMULMEM (0,%1,BCAST8,Chi_22)         
+		   VMULMEM (0,%1,BCAST9,Chi_30)
+		   VMULMEM (0,%1,BCAST10,Chi_31)       
+		   VMULMEM (0,%1,BCAST11,Chi_32)
+		   : : "r" (a0), "r" (a1), "r" (a2)  );
+	} else { 
+	  asm (
+		   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
+		   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
+		   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
+		   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
+		   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
+		   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
+		   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
+		   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
+		   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
+		   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
+		   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
+		   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
+		   : : "r" (a0), "r" (a1), "r" (a2)  );
+	}
+	a0 = a0+incr;
+	a1 = a1+incr;
+	a2 = a2+sizeof(Simd::scalar_type);
+      }}
+    {
+      int lexa = s1+LLs*site;
+      asm (
+	       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
+	       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
+	       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
+	       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
+	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+    }
+  }
+  }
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+#endif
+};
+
+  // Z-mobius version
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
+					     int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
+{
+#ifndef AVX512
+  {
+  SiteHalfSpinor BcastP;
+  SiteHalfSpinor BcastM;
+  SiteHalfSpinor SiteChiP;
+  SiteHalfSpinor SiteChiM;
+
+  // Ls*Ls * 2 * 12 * vol flops
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+        int s=s2+l*LLs;
+	int lex=s2+LLs*site;
+	
+	if ( s2==0 && l==0) {
+	  SiteChiP=zero;
+	  SiteChiM=zero;
+	}
+	
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	}}
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	}}
+
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
+	  SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
+	}}
+
+
+    }}
+    {
+      int lex = s1+LLs*site;
+      for(int sp=0;sp<2;sp++){
+      for(int co=0;co<Nc;co++){
+	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+      }}
+    }
+  }
+
+  }
+#else
+  {
+  // pointers
+  //  MASK_REGS;
+#define Chi_00 %zmm0
+#define Chi_01 %zmm1
+#define Chi_02 %zmm2
+#define Chi_10 %zmm3
+#define Chi_11 %zmm4
+#define Chi_12 %zmm5
+#define Chi_20 %zmm6
+#define Chi_21 %zmm7
+#define Chi_22 %zmm8
+#define Chi_30 %zmm9
+#define Chi_31 %zmm10
+#define Chi_32 %zmm11
+#define pChi_00 %%zmm0
+#define pChi_01 %%zmm1
+#define pChi_02 %%zmm2
+#define pChi_10 %%zmm3
+#define pChi_11 %%zmm4
+#define pChi_12 %%zmm5
+#define pChi_20 %%zmm6
+#define pChi_21 %%zmm7
+#define pChi_22 %%zmm8
+#define pChi_30 %%zmm9
+#define pChi_31 %%zmm10
+#define pChi_32 %%zmm11
+
+#define BCAST_00   %zmm12
+#define  SHUF_00   %zmm13
+#define BCAST_01   %zmm14
+#define  SHUF_01   %zmm15
+#define BCAST_02   %zmm16
+#define  SHUF_02   %zmm17
+#define BCAST_10   %zmm18
+#define  SHUF_10   %zmm19
+#define BCAST_11   %zmm20
+#define  SHUF_11   %zmm21
+#define BCAST_12   %zmm22
+#define  SHUF_12   %zmm23
+
+#define Mp  %zmm24
+#define Mps %zmm25
+#define Mm  %zmm26
+#define Mms %zmm27
+#define N 8
+  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      int lex=s2+LLs*site;
+      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+      uint64_t a2 = (uint64_t)&psi[lex];
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	if ( (s2+l)==0 ) {
+	  LOAD64(%r8,a0);
+	  LOAD64(%r9,a1);
+	  LOAD64(%r10,a2);
+	  asm (
+	       VLOAD(0,%r8,Mp)// i r
+	       VLOAD(0,%r9,Mm)
+	       VSHUF(Mp,Mps)  // r i 
+	       VSHUF(Mm,Mms)
+	       VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
+	       VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
+
+	       VMULIDUP(0*N,%r10,Mps,Chi_00)
+	       VMULIDUP(1*N,%r10,Mps,Chi_01)
+	       VMULIDUP(2*N,%r10,Mps,Chi_02)
+	       VMULIDUP(3*N,%r10,Mps,Chi_10)
+	       VMULIDUP(4*N,%r10,Mps,Chi_11)
+	       VMULIDUP(5*N,%r10,Mps,Chi_12)
+
+	       VMULIDUP(6*N ,%r10,Mms,Chi_20)
+	       VMULIDUP(7*N ,%r10,Mms,Chi_21)
+	       VMULIDUP(8*N ,%r10,Mms,Chi_22)
+	       VMULIDUP(9*N ,%r10,Mms,Chi_30)
+	       VMULIDUP(10*N,%r10,Mms,Chi_31)
+	       VMULIDUP(11*N,%r10,Mms,Chi_32)
+
+	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
+	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
+	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+	       VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
+	       VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
+	       VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
+	       VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
+	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+	       );
+	} else { 
+	  LOAD64(%r8,a0);
+	  LOAD64(%r9,a1);
+	  LOAD64(%r10,a2);
+	  asm (
+	       VLOAD(0,%r8,Mp)
+	       VSHUF(Mp,Mps)
+
+	       VLOAD(0,%r9,Mm)
+	       VSHUF(Mm,Mms)
+
+	       VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
+	       VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
+	       VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
+	       VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
+	       VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
+	       VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
+
+	       VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
+	       VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
+	       VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
+	       VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
+	       VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
+	       VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
+
+	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
+	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
+	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+	       VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
+	       VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
+	       VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
+	       VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
+	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+	       );
+	}
+	a0 = a0+incr;
+	a1 = a1+incr;
+	a2 = a2+sizeof(Simd::scalar_type);
+      }}
+    {
+      int lexa = s1+LLs*site;
+      /*
+      SiteSpinor tmp;
+      asm (
+	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	       : : "r" ((uint64_t)&tmp) : "memory" );
+      */
+
+      asm (
+	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+      //      if ( 1 || (site==0) ) { 
+      //	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
+      //      }
+    }
+  }
+  }
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+
+#endif
+};
+
+
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
@@ -299,108 +767,41 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField

  chi.checkerboard=psi.checkerboard;
  
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Vector<iSinglet<Simd> >  Matp;
+  Vector<iSinglet<Simd> >  Matm;
+  Vector<iSinglet<Simd> >  *_Matp;
+  Vector<iSinglet<Simd> >  *_Matm;
  
-  for(int s=0;s<Ls;s++){
-    Pplus(s,s) = bee[s];
-    Pminus(s,s)= bee[s];
+  //  MooeeInternalCompute(dag,inv,Matp,Matm);
+  if ( inv && dag ) { 
+    _Matp = &MatpInvDag;
+    _Matm = &MatmInvDag;
  }
-  
-  for(int s=0;s<Ls-1;s++){
-    Pminus(s,s+1) = -cee[s];
+  if ( inv && (!dag) ) { 
+    _Matp = &MatpInv;
+    _Matm = &MatmInv;
+  } 
+  if ( !inv ) {
+    MooeeInternalCompute(dag,inv,Matp,Matm);
+    _Matp = &Matp;
+    _Matm = &Matm;
  }
-  
-  for(int s=0;s<Ls-1;s++){
-    Pplus(s+1,s) = -cee[s+1];
-  }
-  Pplus (0,Ls-1) = mass*cee[0];
-  Pminus(Ls-1,0) = mass*cee[Ls-1];
-  
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-  
-  if ( inv ) {
-    PplusMat =Pplus.inverse();
-    PminusMat=Pminus.inverse();
-  } else { 
-    PplusMat =Pplus;
-    PminusMat=Pminus;
-  }
-  
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-  
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd=Simd::Nsimd();
-  Vector<iSinglet<Simd> > Matp(Ls*LLs);
-  Vector<iSinglet<Simd> > Matm(Ls*LLs);
+  assert(_Matp->size()==Ls*LLs);

-  for(int s2=0;s2<Ls;s2++){
-  for(int s1=0;s1<LLs;s1++){
-    int istride = LLs;
-    int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type *)&Vp;
-      scalar_type *sm = (scalar_type *)&Vm;
-      for(int l=0;l<Nsimd;l++){
-	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
-	sm[l] = PminusMat(l*istride+s1*ostride,s2);
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }
-  }
-  
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
-  // Dynamic allocate on stack to get per thread without serialised heap acces
-#pragma omp parallel  
-  {

-    Vector<SiteHalfSpinor> SitePplus(LLs);
-    Vector<SiteHalfSpinor> SitePminus(LLs);
-    Vector<SiteHalfSpinor> SiteChiP(LLs);
-    Vector<SiteHalfSpinor> SiteChiM(LLs);
-    Vector<SiteSpinor>     SiteChi(LLs);
-
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-
-#pragma omp for 
-  for(auto site=0;site<vol;site++){
-
-    for(int s=0;s<LLs;s++){
-      int lex = s+LLs*site;
-      spProj5p(SitePplus[s] ,psi[lex]);
-      spProj5m(SitePminus[s],psi[lex]);
-      SiteChiP[s]=zero;
-      SiteChiM[s]=zero;
+  if ( switcheroo<Coeff_t>::iscomplex() ) {
+  PARALLEL_FOR_LOOP
+    for(auto site=0;site<vol;site++){
+      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    }
-      
-    int s=0;
-    for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
-	vbroadcast(BcastP,SitePplus [s2],l);
-	vbroadcast(BcastM,SitePminus[s2],l);
-	for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
-	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
-	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
-	}
-      s++;
-    }}
-
-    for(int s=0;s<LLs;s++){
-      int lex = s+LLs*site;
-      spRecon5p(SiteChi[s],SiteChiP[s]);
-      accumRecon5m(SiteChi[s],SiteChiM[s]);
-      chi[lex] = SiteChi[s]*0.5;
+  } else { 
+  PARALLEL_FOR_LOOP
+    for(auto site=0;site<vol;site++){
+      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    }
  }
-  }
  MooeeInvTime+=usecond();
 }

@@ -414,4 +815,5 @@ template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const Fermion
 template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);

+
 }}
--- a/lib/qcd/action/fermion/FermionOperator.h
+++ b/lib/qcd/action/fermion/FermionOperator.h
@@ -48,6 +48,8 @@ namespace Grid {

      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};

+      virtual FermionField &tmp(void) = 0; // workspace field owned by the concrete operator, returned by mutable reference (CayleyFermion5D::Dminus writes its DW result into it)
+
      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -48,10 +48,12 @@ namespace QCD {
  //    typedef typename XXX         GaugeField;
  //    typedef typename XXX      GaugeActField;
  //    typedef typename XXX       FermionField;
+  //    typedef typename XXX    PropagatorField;
  //    typedef typename XXX  DoubledGaugeField;
  //    typedef typename XXX         SiteSpinor;
-  //    typedef typename XXX     SiteHalfSpinor;        
-  //    typedef typename XXX         Compressor;        
+  //    typedef typename XXX     SitePropagator;
+  //    typedef typename XXX     SiteHalfSpinor;	
+  //    typedef typename XXX         Compressor;	
  //
  // and Methods:
  //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
@@ -94,14 +96,16 @@ namespace QCD {
  ////////////////////////////////////////////////////////////////////////
  
 #define INHERIT_FIMPL_TYPES(Impl)\
-  typedef typename Impl::FermionField           FermionField;           \
-  typedef typename Impl::DoubledGaugeField DoubledGaugeField;           \
-  typedef typename Impl::SiteSpinor               SiteSpinor;           \
-  typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;           \
-  typedef typename Impl::Compressor               Compressor;           \
-  typedef typename Impl::StencilImpl             StencilImpl;           \
-  typedef typename Impl::ImplParams ImplParams;                         \
-  typedef typename Impl::Coeff_t       Coeff_t;
+  typedef typename Impl::FermionField           FermionField;		\
+  typedef typename Impl::PropagatorField     PropagatorField;		\
+  typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
+  typedef typename Impl::SiteSpinor               SiteSpinor;		\
+  typedef typename Impl::SitePropagator       SitePropagator;		\
+  typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
+  typedef typename Impl::Compressor               Compressor;		\
+  typedef typename Impl::StencilImpl             StencilImpl;		\
+  typedef typename Impl::ImplParams               ImplParams;	        \
+  typedef typename Impl::Coeff_t                     Coeff_t; /* final entry: no line continuation, so the macro ends here */
  
 #define INHERIT_IMPL_TYPES(Base) \
  INHERIT_GIMPL_TYPES(Base)      \
@@ -127,14 +131,17 @@ namespace QCD {
    INHERIT_GIMPL_TYPES(Gimpl);
      
    template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+    template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >; // same nesting as iImplSpinor with both iVector levels replaced by iMatrix
    template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
    template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
    
    typedef iImplSpinor<Simd>            SiteSpinor;
+    typedef iImplPropagator<Simd>        SitePropagator;
    typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
    typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
    
    typedef Lattice<SiteSpinor>            FermionField;
+    typedef Lattice<SitePropagator>        PropagatorField;
    typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
    
    typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
@@ -216,14 +223,17 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
  INHERIT_GIMPL_TYPES(Gimpl);
  
  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >; // same nesting as iImplSpinor with both iVector levels replaced by iMatrix
  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
  
  typedef iImplSpinor<Simd> SiteSpinor;
+  typedef iImplPropagator<Simd> SitePropagator;
  typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
  typedef Lattice<SiteSpinor> FermionField;
+  typedef Lattice<SitePropagator> PropagatorField;
  
  // Make the doubled gauge field a *scalar*
  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
@@ -352,14 +362,17 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 INHERIT_GIMPL_TYPES(Gimpl);
      
 template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
+ template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>, Ngp >; // iImplSpinor with the two inner iVector levels replaced by iMatrix; the outer Ngp level stays an iVector
 template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
 template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
      
 typedef iImplSpinor<Simd> SiteSpinor;
+ typedef iImplPropagator<Simd> SitePropagator;
 typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
 typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
 
 typedef Lattice<SiteSpinor> FermionField;
+ typedef Lattice<SitePropagator> PropagatorField;
 typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 
 typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -61,7 +61,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      LebesgueEvenOdd(_cbgrid),
      Umu(&Fgrid),
      UmuEven(&Hgrid),
-      UmuOdd(&Hgrid) {
+      UmuOdd(&Hgrid),
+      _tmp(&Hgrid)
+{
  // Allocate the required comms buffer
  ImportGauge(_Umu);
 }
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  GridBase *FermionGrid(void) { return _grid; }
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }

+  FermionField _tmp; // workspace field; the constructor in WilsonFermion.cc builds it from &Hgrid, the same grid argument used for UmuEven/UmuOdd
+  FermionField &tmp(void) { return _tmp; } // mutable access to the workspace; presumably the implementation of FermionOperator::tmp() - confirm the base-class chain
+
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -61,7 +61,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuEven(_FourDimRedBlackGrid),
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
-  LebesgueEvenOdd(_FourDimRedBlackGrid)
+  LebesgueEvenOdd(_FourDimRedBlackGrid),
+  _tmp(&FiveDimRedBlackGrid)
 {
  if (Impl::LsVectorised) { 

--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -74,6 +74,9 @@ namespace QCD {
     typedef WilsonKernels<Impl> Kernels;
     PmuStat stat;

+     FermionField _tmp; // workspace field; the constructor in WilsonFermion5D.cc builds it from &FiveDimRedBlackGrid. NOTE(review): CayleyFermion5D::Dminus calls DW(psi, tmp(), ...) - confirm psi always lives on this grid
+     FermionField &tmp(void) { return _tmp; } // mutable access to the single per-operator workspace; contents are overwritten by whoever uses it next
+
     void Report(void);
     void ZeroCounters(void);
     double DhopCalls;