Lots of debug on performance Mobius

2026-07-24 04:23:28 +01:00 · 2016-12-08 17:28:28 +00:00
parent ff71a8e847
commit fb8d4b2357
8 changed files with 304 additions and 87 deletions
@@ -54,12 +54,11 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  FermionField tmp(psi._grid);
-  this->DW(psi,tmp,DaggerNo);
+  this->DW(psi,this->tmp(),DaggerNo);
  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
  }
 }
@@ -87,8 +86,8 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-    // Flops = 9*12*Ls*vol/2
+    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
-    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }
@@ -110,12 +109,11 @@ template<class Impl>
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  FermionField tmp(psi._grid);
-  this->DW(psi,tmp,DaggerYes);
+  this->DW(psi,this->tmp(),DaggerYes);
  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
  }
 }
 template<class Impl>  
@@ -138,6 +136,7 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redundant with the above routine; check this and eliminate
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
@@ -259,36 +258,33 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  FermionField tmp(psi._grid);
-  Meooe5D(psi,tmp); 
+  Meooe5D(psi,this->tmp()); 
  if ( psi.checkerboard == Odd ) {
-    this->DhopEO(tmp,chi,DaggerNo);
+    this->DhopEO(this->tmp(),chi,DaggerNo);
  } else {
-    this->DhopOE(tmp,chi,DaggerNo);
+    this->DhopOE(this->tmp(),chi,DaggerNo);
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
 {
  FermionField tmp(psi._grid);
  // Apply 4d dslash
  if ( psi.checkerboard == Odd ) {
-    this->DhopEO(psi,tmp,DaggerYes);
+    this->DhopEO(psi,this->tmp(),DaggerYes);
  } else {
-    this->DhopOE(psi,tmp,DaggerYes);
+    this->DhopOE(psi,this->tmp(),DaggerYes);
  }
-  MeooeDag5D(tmp,chi); 
+  MeooeDag5D(this->tmp(),chi); 
 }
 template<class Impl>
 void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  FermionField tmp(psi._grid);
+  Meo5D(psi,this->tmp());
  Meo5D(psi,tmp);
  // Apply 4d dslash fragment
-  this->DhopDir(tmp,chi,dir,disp);
+  this->DhopDir(this->tmp(),chi,dir,disp);
 }
 // force terms; five routines; default to Dhop on diagonal
 template<class Impl>
@@ -76,6 +76,11 @@ namespace Grid {
 		  std::vector<Coeff_t> &diag,
 		  std::vector<Coeff_t> &upper);
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
      void MooeeInternalAsm(const FermionField &in, FermionField &out,
 			    int LLs, int site,
 			    Vector<iSinglet<Simd> > &Matp,
 			    Vector<iSinglet<Simd> > &Matm);
      virtual void   Instantiatable(void)=0;
@@ -34,8 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
-namespace QCD {
+namespace QCD {  /*
  /*
   * Dense matrix versions of routines
   */
 template<class Impl>
@@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP
      for(int v=0;v<LLs;v++){
 	vprefetch(psi[ss+v+LLs]);
 	//	vprefetch(phi[ss+v+LLs]);
 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
@@ -145,9 +143,6 @@ PARALLEL_FOR_LOOP
 	Simd hm_11 = psi[ss+vm]()(1)(1); 
 	Simd hm_12 = psi[ss+vm]()(1)(2); 
 	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
 	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
 	if ( vp<=v ) {
 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
@@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP
 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 	}
-	/*
+	// Can force these to real arithmetic and save 2x.
-	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
+	Simd p_00  = real_mult(d[v]()()(), phi[ss+v]()(0)(0))  + real_mult(l[v]()()(),hm_00); 
-	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
+	Simd p_01  = real_mult(d[v]()()(), phi[ss+v]()(0)(1))  + real_mult(l[v]()()(),hm_01); 
-	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
+	Simd p_02  = real_mult(d[v]()()(), phi[ss+v]()(0)(2))  + real_mult(l[v]()()(),hm_02); 
-	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
+	Simd p_10  = real_mult(d[v]()()(), phi[ss+v]()(1)(0))  + real_mult(l[v]()()(),hm_10); 
-	*/	
+	Simd p_11  = real_mult(d[v]()()(), phi[ss+v]()(1)(1))  + real_mult(l[v]()()(),hm_11); 
-	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
+	Simd p_12  = real_mult(d[v]()()(), phi[ss+v]()(1)(2))  + real_mult(l[v]()()(),hm_12); 
-	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
+	Simd p_20  = real_mult(d[v]()()(), phi[ss+v]()(2)(0))  + real_mult(u[v]()()(),hp_00); 
-	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
+	Simd p_21  = real_mult(d[v]()()(), phi[ss+v]()(2)(1))  + real_mult(u[v]()()(),hp_01); 
-	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
+	Simd p_22  = real_mult(d[v]()()(), phi[ss+v]()(2)(2))  + real_mult(u[v]()()(),hp_02);  
-	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
+	Simd p_30  = real_mult(d[v]()()(), phi[ss+v]()(3)(0))  + real_mult(u[v]()()(),hp_10); 
-	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
+	Simd p_31  = real_mult(d[v]()()(), phi[ss+v]()(3)(1))  + real_mult(u[v]()()(),hp_11); 
-	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
+	Simd p_32  = real_mult(d[v]()()(), phi[ss+v]()(3)(2))  + real_mult(u[v]()()(),hp_12); 
 	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
 	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
 	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
 	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
 	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
 	//	if ( ss==0){
 	/*
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
 	}
 	*/
 	vstream(chi[ss+v]()(0)(0),p_00);
 	vstream(chi[ss+v]()(0)(1),p_01);
 	vstream(chi[ss+v]()(0)(2),p_02);
@@ -261,7 +234,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-
+#if 0
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
@@ -287,9 +260,231 @@ PARALLEL_FOR_LOOP
      chi[ss+v] = chi[ss+v]     +l[v]*fm;
    }
 #else
      for(int v=0;v<LLs;v++){
 	vprefetch(psi[ss+v+LLs]);
 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
 	Simd hp_00 = psi[ss+vp]()(0)(0); 
 	Simd hp_01 = psi[ss+vp]()(0)(1); 
 	Simd hp_02 = psi[ss+vp]()(0)(2); 
 	Simd hp_10 = psi[ss+vp]()(1)(0); 
 	Simd hp_11 = psi[ss+vp]()(1)(1); 
 	Simd hp_12 = psi[ss+vp]()(1)(2); 
 	Simd hm_00 = psi[ss+vm]()(2)(0); 
 	Simd hm_01 = psi[ss+vm]()(2)(1); 
 	Simd hm_02 = psi[ss+vm]()(2)(2); 
 	Simd hm_10 = psi[ss+vm]()(3)(0); 
 	Simd hm_11 = psi[ss+vm]()(3)(1); 
 	Simd hm_12 = psi[ss+vm]()(3)(2); 
 	if ( vp<=v ) {
 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 	}
 	if ( vm>=v ) {
 	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 	}
 	Simd p_00  = real_mult(d[v]()()(), phi[ss+v]()(0)(0))  + real_mult(u[v]()()(),hp_00); 
 	Simd p_01  = real_mult(d[v]()()(), phi[ss+v]()(0)(1))  + real_mult(u[v]()()(),hp_01); 
 	Simd p_02  = real_mult(d[v]()()(), phi[ss+v]()(0)(2))  + real_mult(u[v]()()(),hp_02); 
 	Simd p_10  = real_mult(d[v]()()(), phi[ss+v]()(1)(0))  + real_mult(u[v]()()(),hp_10); 
 	Simd p_11  = real_mult(d[v]()()(), phi[ss+v]()(1)(1))  + real_mult(u[v]()()(),hp_11); 
 	Simd p_12  = real_mult(d[v]()()(), phi[ss+v]()(1)(2))  + real_mult(u[v]()()(),hp_12); 
 	Simd p_20  = real_mult(d[v]()()(), phi[ss+v]()(2)(0))  + real_mult(l[v]()()(),hm_00); 
 	Simd p_21  = real_mult(d[v]()()(), phi[ss+v]()(2)(1))  + real_mult(l[v]()()(),hm_01); 
 	Simd p_22  = real_mult(d[v]()()(), phi[ss+v]()(2)(2))  + real_mult(l[v]()()(),hm_02);  
 	Simd p_30  = real_mult(d[v]()()(), phi[ss+v]()(3)(0))  + real_mult(l[v]()()(),hm_10); 
 	Simd p_31  = real_mult(d[v]()()(), phi[ss+v]()(3)(1))  + real_mult(l[v]()()(),hm_11); 
 	Simd p_32  = real_mult(d[v]()()(), phi[ss+v]()(3)(2))  + real_mult(l[v]()()(),hm_12); 
 	vstream(chi[ss+v]()(0)(0),p_00);
 	vstream(chi[ss+v]()(0)(1),p_01);
 	vstream(chi[ss+v]()(0)(2),p_02);
 	vstream(chi[ss+v]()(1)(0),p_10);
 	vstream(chi[ss+v]()(1)(1),p_11);
 	vstream(chi[ss+v]()(1)(2),p_12);
 	vstream(chi[ss+v]()(2)(0),p_20);
 	vstream(chi[ss+v]()(2)(1),p_21);
 	vstream(chi[ss+v]()(2)(2),p_22);
 	vstream(chi[ss+v]()(3)(0),p_30);
 	vstream(chi[ss+v]()(3)(1),p_31);
 	vstream(chi[ss+v]()(3)(2),p_32);
      }
 #endif
  }
  M5Dtime+=usecond();
 }
 #include <simd/Intel512common.h>
 #include <simd/Intel512avx.h>
 #include <simd/Intel512single.h>
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
 					     int LLs, int site,
 					     Vector<iSinglet<Simd> > &Matp,
 					     Vector<iSinglet<Simd> > &Matm)
 {
 #if 0
  {
  SiteHalfSpinor BcastP;
  SiteHalfSpinor BcastM;
  SiteHalfSpinor SiteChiP;
  SiteHalfSpinor SiteChiM;
  // Ls*Ls * 2 * 12 * vol flops
  for(int s1=0;s1<LLs;s1++){ 
    for(int s2=0;s2<LLs;s2++){ 
      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
        int s=s2+l*LLs;
 	int lex=s2+LLs*site;
 	if ( s2==0 && l==0) {
 	  SiteChiP=zero;
 	  SiteChiM=zero;
 	}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
 	}}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
 	}}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
 	  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
 	}}
    }}
    {
      int lex = s1+LLs*site;
      for(int sp=0;sp<2;sp++){
      for(int co=0;co<Nc;co++){
 	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
 	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
      }}
    }
  }
  }
 #else
  {
  // pointers
    //  MASK_REGS;
 #define Chi_00 %%zmm1
 #define Chi_01 %%zmm2
 #define Chi_02 %%zmm3
 #define Chi_10 %%zmm4
 #define Chi_11 %%zmm5
 #define Chi_12 %%zmm6
 #define Chi_20 %%zmm7
 #define Chi_21 %%zmm8
 #define Chi_22 %%zmm9
 #define Chi_30 %%zmm10
 #define Chi_31 %%zmm11
 #define Chi_32 %%zmm12
 #define BCAST0   %%zmm13
 #define BCAST1   %%zmm14
 #define BCAST2   %%zmm15
 #define BCAST3   %%zmm16
 #define BCAST4   %%zmm17
 #define BCAST5   %%zmm18
 #define BCAST6   %%zmm19
 #define BCAST7   %%zmm20
 #define BCAST8   %%zmm21
 #define BCAST9   %%zmm22
 #define BCAST10  %%zmm23
 #define BCAST11  %%zmm24
  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
  for(int s1=0;s1<LLs;s1++){ 
    for(int s2=0;s2<LLs;s2++){ 
      int lex=s2+LLs*site;
      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
      uint64_t a2 = (uint64_t)&psi[lex];
      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
 	if ( (s2+l)==0 ) {
 	  asm (
  	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
  	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
  	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
 		   VBCASTCDUP(0,%2,BCAST0)   
 		   VBCASTCDUP(1,%2,BCAST1)   
 		   VBCASTCDUP(2,%2,BCAST2)   
 		   VBCASTCDUP(3,%2,BCAST3)   
 		   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
 		   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
 		   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
 		   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
 		   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
 		   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
 		   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
 		   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
 		   VMULMEM (0,%1,BCAST8,Chi_22)         
 		   VMULMEM (0,%1,BCAST9,Chi_30)
 		   VMULMEM (0,%1,BCAST10,Chi_31)       
 		   VMULMEM (0,%1,BCAST11,Chi_32)
 		   : : "r" (a0), "r" (a1), "r" (a2)  );
 	} else { 
 	  asm (
 		   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
 		   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
 		   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
 		   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
 		   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
 		   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
 		   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
 		   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
 		   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
 		   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
 		   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
 		   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
 		   : : "r" (a0), "r" (a1), "r" (a2)  );
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
 	a2 = a2+sizeof(Simd::scalar_type);
      }}
    {
      int lexa = s1+LLs*site;
      asm (
 	       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
 	       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
 	       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
 	       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
 	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
    }
  }
  }
 #endif
 };
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
@@ -342,37 +537,38 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  for(int s1=0;s1<LLs;s1++){
    int istride = LLs;
    int ostride = 1;
-      Simd Vp;
+    Simd Vp;
-      Simd Vm;
+    Simd Vm;
-      scalar_type *sp = (scalar_type *)&Vp;
+    scalar_type *sp = (scalar_type *)&Vp;
-      scalar_type *sm = (scalar_type *)&Vm;
+    scalar_type *sm = (scalar_type *)&Vm;
-      for(int l=0;l<Nsimd;l++){
+    for(int l=0;l<Nsimd;l++){
-	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
+      sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	sm[l] = PminusMat(l*istride+s1*ostride,s2);
+      sp[l] = scalar_type(sp[l].real(),sp[l].real());
-      }
+      sm[l] = PminusMat(l*istride+s1*ostride,s2);
-      Matp[LLs*s2+s1] = Vp;
+      sm[l] = scalar_type(sm[l].real(),sm[l].real());
      Matm[LLs*s2+s1] = Vm;
    }
-  }
+    Matp[LLs*s2+s1] = Vp;
    Matm[LLs*s2+s1] = Vm;
  }}
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  // Dynamically allocate on stack to get per thread without serialised heap access
 #if 0
 #pragma omp parallel  
  {
-
+    std::vector<SiteHalfSpinor> SitePplus(LLs);
-    Vector<SiteHalfSpinor> SitePplus(LLs);
+    std::vector<SiteHalfSpinor> SitePminus(LLs);
-    Vector<SiteHalfSpinor> SitePminus(LLs);
+    std::vector<SiteHalfSpinor> SiteChiP(LLs);
-    Vector<SiteHalfSpinor> SiteChiP(LLs);
+    std::vector<SiteHalfSpinor> SiteChiM(LLs);
-    Vector<SiteHalfSpinor> SiteChiM(LLs);
+    std::vector<SiteSpinor>     SiteChi(LLs);
    Vector<SiteSpinor>     SiteChi(LLs);
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
 #pragma omp for 
  for(auto site=0;site<vol;site++){
-
+    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
@@ -390,8 +586,9 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
 	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
 	}
-      s++;
+	s++;
-    }}
+      }
    }
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
@@ -399,8 +596,16 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
      accumRecon5m(SiteChi[s],SiteChiM[s]);
      chi[lex] = SiteChi[s]*0.5;
    }
  }}
 #else    
  PARALLEL_FOR_LOOP
  for(auto site=0;site<vol;site++){
    MooeeInternalAsm(psi,chi,
 		     LLs,site,
 		     Matp,Matm);
  }
-  }
+#endif
  MooeeInvTime+=usecond();
 }
@@ -48,6 +48,8 @@ namespace Grid {
      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
      virtual FermionField &tmp(void) = 0;
      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
@@ -61,7 +61,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      LebesgueEvenOdd(_cbgrid),
      Umu(&Fgrid),
      UmuEven(&Hgrid),
-      UmuOdd(&Hgrid) {
+      UmuOdd(&Hgrid),
      _tmp(&Hgrid)
 {
  // Allocate the required comms buffer
  ImportGauge(_Umu);
 }
@@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  GridBase *FermionGrid(void) { return _grid; }
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
@@ -60,7 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuEven(_FourDimRedBlackGrid),
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
-  LebesgueEvenOdd(_FourDimRedBlackGrid)
+  LebesgueEvenOdd(_FourDimRedBlackGrid),
  _tmp(&FiveDimRedBlackGrid)
 {
  if (Impl::LsVectorised) { 
@@ -74,6 +74,9 @@ namespace QCD {
     typedef WilsonKernels<Impl> Kernels;
     PmuStat stat;
     FermionField _tmp;
     FermionField &tmp(void) { return _tmp; }
     void Report(void);
     void ZeroCounters(void);
     double DhopCalls;