Merge remote-tracking branch 'upstream/master'

Conflicts: lib/math/Grid_math_tensors.h lib/simd/Grid_vector_types.h
2025-07-27 01:37:07 +01:00 · 2015-05-26 13:14:06 +09:00
parent 377083e6ae c2ffb1a098
commit 9ad6d0c65f
39 changed files with 1091 additions and 439 deletions
--- a/benchmarks/Grid_comms.cc
+++ b/benchmarks/Grid_comms.cc
@@ -23,7 +23,7 @@ int main (int argc, char ** argv)



-  for(int lat=4;lat<=16;lat+=4){
+  for(int lat=4;lat<=32;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

      std::vector<int> latt_size  ({lat,lat,lat,lat});
@@ -94,8 +94,7 @@ int main (int argc, char ** argv)
  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;


-
-  for(int lat=4;lat<=16;lat+=4){
+  for(int lat=4;lat<=32;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

      std::vector<int> latt_size  ({lat,lat,lat,lat});
--- a/benchmarks/Grid_su3_expr.cc
+++ b/benchmarks/Grid_su3_expr.cc
@@ -0,0 +1,11 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+void su3_test_mult_expression(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y)
+{
+  z=x*y;
+}
+
--- a/benchmarks/Grid_su3_test.cc
+++ b/benchmarks/Grid_su3_test.cc
@@ -0,0 +1,11 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+void su3_test_mult_routine(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y)
+{
+  mult(z,x,y);
+}
--- a/benchmarks/Grid_wilson.cc
+++ b/benchmarks/Grid_wilson.cc
@@ -24,7 +24,8 @@ int main (int argc, char ** argv)
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
-  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);

  int threads = GridThread::GetThreads();
  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
@@ -36,11 +37,11 @@ int main (int argc, char ** argv)
  //  pRNG.SeedFixedIntegers(seeds);
  pRNG.SeedRandomDevice();

-  LatticeFermion src(&Grid); random(pRNG,src);
+  LatticeFermion src   (&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=zero;
  LatticeFermion    ref(&Grid);    ref=zero;
-  LatticeFermion    err(&Grid);    
  LatticeFermion    tmp(&Grid);    tmp=zero;
+  LatticeFermion    err(&Grid);    tmp=zero;
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);

@@ -51,8 +52,11 @@ int main (int argc, char ** argv)

  // Only one non-zero (y)
  Umu=zero;
+  Complex cone(1.0,0.0);
  for(int nn=0;nn<Nd;nn++){
    random(pRNG,U[nn]);
+    if (nn!=0) U[nn]=zero;
+    else U[nn] = cone;
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }

@@ -78,7 +82,7 @@ int main (int argc, char ** argv)
  }

  RealD mass=0.1;
-  WilsonMatrix Dw(Umu,mass);
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
  
  std::cout << "Calling Dw"<<std::endl;
  int ncall=1000;
@@ -93,7 +97,7 @@ int main (int argc, char ** argv)
  std::cout << "norm result "<< norm2(result)<<std::endl;
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-  err = ref -result;
+  err = ref-result; 
  std::cout << "norm diff   "<< norm2(err)<<std::endl;


@@ -129,9 +133,8 @@ int main (int argc, char ** argv)
  std::cout << "Called DwDag"<<std::endl;
  std::cout << "norm result "<< norm2(result)<<std::endl;
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
-  err = ref -result;
+  err = ref-result; 
  std::cout << "norm diff   "<< norm2(err)<<std::endl;

-
  Grid_finalize();
 }
--- a/benchmarks/Grid_wilson_cg_prec.cc
+++ b/benchmarks/Grid_wilson_cg_prec.cc
@@ -0,0 +1,61 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeFermion src(&Grid); random(pRNG,src);
+  RealD nrm = norm2(src);
+  LatticeFermion result(&Grid); result=zero;
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,&Grid);
+
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass=0.5;
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
+
+  //  HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
+  //  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  //  CG(HermOp,src,result);
+  
+  LatticeFermion    src_o(&RBGrid);
+  LatticeFermion result_o(&RBGrid);
+  pickCheckerboard(Odd,src_o,src);
+  result_o=zero;
+
+  HermitianCheckerBoardedOperator<WilsonMatrix,LatticeFermion> HermOpEO(Dw);
+  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  CG(HermOpEO,src_o,result_o);
+
+  Grid_finalize();
+}
--- a/benchmarks/Grid_wilson_cg_schur.cc
+++ b/benchmarks/Grid_wilson_cg_schur.cc
@@ -0,0 +1,48 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+
+  LatticeFermion    src(&Grid); random(pRNG,src);
+  LatticeFermion result(&Grid); result=zero;
+  LatticeFermion resid(&Grid); 
+
+  RealD mass=0.5;
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
+
+  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  SchurRedBlackSolve<LatticeFermion> SchurSolver(CG);
+
+  SchurSolver(Dw,src,result);
+  
+  Grid_finalize();
+}
--- a/benchmarks/Grid_wilson_cg_unprec.cc
+++ b/benchmarks/Grid_wilson_cg_unprec.cc
@@ -24,13 +24,14 @@ int main (int argc, char ** argv)
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
-  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);

  LatticeFermion src(&Grid); random(pRNG,src);
-  std::cout << "src norm" << norm2(src)<<std::endl;
+  RealD nrm = norm2(src);
  LatticeFermion result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);

@@ -46,11 +47,11 @@ int main (int argc, char ** argv)
  }
  
  RealD mass=0.5;
-  WilsonMatrix Dw(Umu,mass);
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
+
  HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
  CG(HermOp,src,result);

-  
  Grid_finalize();
 }
--- a/benchmarks/Grid_wilson_evenodd.cc
+++ b/benchmarks/Grid_wilson_evenodd.cc
@@ -0,0 +1,201 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
+
+  int threads = GridThread::GetThreads();
+  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> seeds({1,2,3,4});
+
+  GridParallelRNG          pRNG(&Grid);
+  //  std::vector<int> seeds({1,2,3,4});
+  //  pRNG.SeedFixedIntegers(seeds);
+  pRNG.SeedRandomDevice();
+
+  LatticeFermion src   (&Grid); random(pRNG,src);
+  LatticeFermion phi   (&Grid); random(pRNG,phi);
+  LatticeFermion chi   (&Grid); random(pRNG,chi);
+  LatticeFermion result(&Grid); result=zero;
+  LatticeFermion    ref(&Grid);    ref=zero;
+  LatticeFermion    tmp(&Grid);    tmp=zero;
+  LatticeFermion    err(&Grid);    tmp=zero;
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+  std::vector<LatticeColourMatrix> U(4,&Grid);
+
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
+  // Only one non-zero (y)
+  Umu=zero;
+  for(int nn=0;nn<Nd;nn++){
+    random(pRNG,U[nn]);
+    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
+  }
+
+  RealD mass=0.1;
+
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
+
+  LatticeFermion src_e   (&RBGrid);
+  LatticeFermion src_o   (&RBGrid);
+  LatticeFermion r_e   (&RBGrid);
+  LatticeFermion r_o   (&RBGrid);
+  LatticeFermion r_eo  (&Grid);
+
+  const int Even=0;
+  const int Odd=1;
+  std::cout<<"=========================================================="<<std::endl;
+  std::cout<<"= Testing that Deo + Doe = Dunprec "<<std::endl;
+  std::cout<<"=========================================================="<<std::endl;
+
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+
+  Dw.Meooe(src_e,r_o);  std::cout<<"Applied Meo"<<std::endl;
+  Dw.Meooe(src_o,r_e);  std::cout<<"Applied Moe"<<std::endl;
+  Dw.Dhop (src,ref,0);
+
+  setCheckerboard(r_eo,r_o);
+  setCheckerboard(r_eo,r_e);
+
+  ref = (-0.5)*ref;
+  err= ref - r_eo;
+  std::cout << "EO norm diff   "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(r_eo) <<std::endl;
+
+  LatticeComplex cerr(&Grid);
+  cerr = localInnerProduct(err,err);
+
+  std::cout<<"=============================================================="<<std::endl;
+  std::cout<<"= Test Ddagger is the dagger of D by requiring                "<<std::endl;
+  std::cout<<"=  < phi | Deo | chi > * = < chi | Deo^dag| phi>  "<<std::endl;
+  std::cout<<"=============================================================="<<std::endl;
+  
+  LatticeFermion chi_e   (&RBGrid);
+  LatticeFermion chi_o   (&RBGrid);
+
+  LatticeFermion dchi_e  (&RBGrid);
+  LatticeFermion dchi_o  (&RBGrid);
+
+  LatticeFermion phi_e   (&RBGrid);
+  LatticeFermion phi_o   (&RBGrid);
+
+  LatticeFermion dphi_e  (&RBGrid);
+  LatticeFermion dphi_o  (&RBGrid);
+
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+  pickCheckerboard(Even,phi_e,phi);
+  pickCheckerboard(Odd ,phi_o,phi);
+
+  Dw.Meooe(chi_e,dchi_o);
+  Dw.Meooe(chi_o,dchi_e);
+  Dw.MeooeDag(phi_e,dphi_o);
+  Dw.MeooeDag(phi_o,dphi_e);
+
+  ComplexD pDce = innerProduct(phi_e,dchi_e);
+  ComplexD pDco = innerProduct(phi_o,dchi_o);
+  ComplexD cDpe = innerProduct(chi_e,dphi_e);
+  ComplexD cDpo = innerProduct(chi_o,dphi_o);
+
+  std::cout <<"e "<<pDce<<" "<<cDpe <<std::endl;
+  std::cout <<"o "<<pDco<<" "<<cDpo <<std::endl;
+
+  std::cout <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
+  std::cout <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
+
+  std::cout<<"=============================================================="<<std::endl;
+  std::cout<<"= Test MeeInv Mee = 1                                         "<<std::endl;
+  std::cout<<"=============================================================="<<std::endl;
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+
+  Dw.Mooee(chi_e,src_e);
+  Dw.MooeeInv(src_e,phi_e);
+
+  Dw.Mooee(chi_o,src_o);
+  Dw.MooeeInv(src_o,phi_o);
+  
+  setCheckerboard(phi,phi_e);
+  setCheckerboard(phi,phi_o);
+
+  err = phi-chi;
+  std::cout << "norm diff   "<< norm2(err)<< std::endl;
+
+  std::cout<<"=============================================================="<<std::endl;
+  std::cout<<"= Test MeeInvDag MeeDag = 1                                   "<<std::endl;
+  std::cout<<"=============================================================="<<std::endl;
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+
+  Dw.MooeeDag(chi_e,src_e);
+  Dw.MooeeInvDag(src_e,phi_e);
+
+  Dw.MooeeDag(chi_o,src_o);
+  Dw.MooeeInvDag(src_o,phi_o);
+  
+  setCheckerboard(phi,phi_e);
+  setCheckerboard(phi,phi_o);
+
+  err = phi-chi;
+  std::cout << "norm diff   "<< norm2(err)<< std::endl;
+
+  std::cout<<"=============================================================="<<std::endl;
+  std::cout<<"= Test MpcDagMpc is Hermitian              "<<std::endl;
+  std::cout<<"=============================================================="<<std::endl;
+  
+  random(pRNG,phi);
+  random(pRNG,chi);
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+  pickCheckerboard(Even,phi_e,phi);
+  pickCheckerboard(Odd ,phi_o,phi);
+  RealD t1,t2;
+
+  Dw.MpcDagMpc(chi_e,dchi_e,t1,t2);
+  Dw.MpcDagMpc(chi_o,dchi_o,t1,t2);
+
+  Dw.MpcDagMpc(phi_e,dphi_e,t1,t2);
+  Dw.MpcDagMpc(phi_o,dphi_o,t1,t2);
+
+  pDce = innerProduct(phi_e,dchi_e);
+  pDco = innerProduct(phi_o,dchi_o);
+  cDpe = innerProduct(chi_e,dphi_e);
+  cDpo = innerProduct(chi_o,dphi_o);
+
+  std::cout <<"e "<<pDce<<" "<<cDpe <<std::endl;
+  std::cout <<"o "<<pDco<<" "<<cDpo <<std::endl;
+
+  std::cout <<"pDce - conj(cDpo) "<< pDco-conj(cDpo) <<std::endl;
+  std::cout <<"pDco - conj(cDpe) "<< pDce-conj(cDpe) <<std::endl;
+  
+  Grid_finalize();
+}
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -5,18 +5,27 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Test code
 #
-bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec
+bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec Grid_wilson_evenodd  Grid_wilson_cg_prec Grid_wilson_cg_schur

 Grid_wilson_SOURCES = Grid_wilson.cc
 Grid_wilson_LDADD = -lGrid

+Grid_wilson_evenodd_SOURCES = Grid_wilson_evenodd.cc
+Grid_wilson_evenodd_LDADD = -lGrid
+
 Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc
 Grid_wilson_cg_unprec_LDADD = -lGrid

+Grid_wilson_cg_prec_SOURCES = Grid_wilson_cg_prec.cc
+Grid_wilson_cg_prec_LDADD = -lGrid
+
+Grid_wilson_cg_schur_SOURCES = Grid_wilson_cg_schur.cc
+Grid_wilson_cg_schur_LDADD = -lGrid
+
 Grid_comms_SOURCES = Grid_comms.cc
 Grid_comms_LDADD = -lGrid

-Grid_su3_SOURCES = Grid_su3.cc
+Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc Grid_su3_expr.cc
 Grid_su3_LDADD = -lGrid

 Grid_memory_bandwidth_SOURCES = Grid_memory_bandwidth.cc
--- a/lib/Grid_init.cc
+++ b/lib/Grid_init.cc
@@ -11,6 +11,7 @@
 #include <sys/time.h>
 #include <signal.h>
 #include <iostream>
+#include <iterator>
 #include <Grid.h>
 #include <algorithm>
 #include <iterator>
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -13,28 +13,6 @@

 typedef uint32_t Integer;

-#ifdef SSE4
-#include <pmmintrin.h>
-#endif
-#if defined(AVX1) || defined (AVX2)
-#include <immintrin.h>
-    
-// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
-#ifndef _mm256_set_m128i
-#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
-#endif
-
-#endif
-
-#ifdef AVX512
-#include <immintrin.h>
-#ifndef KNC_ONLY_STORES
-#define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
-#define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
-#endif
-
-#endif
-
 namespace Grid {

  typedef  float  RealF;
@@ -117,7 +95,7 @@ namespace Grid {
  template<>            inline void zeroit(RealF &arg){ arg=0; };
  template<>            inline void zeroit(RealD &arg){ arg=0; };

-
+  // Eventually delete this part
 #if defined (SSE4)
    typedef __m128 fvec;
    typedef __m128d dvec;
@@ -146,6 +124,7 @@ namespace Grid {
    typedef vector4double dvec;
    typedef vector4double zvec;
 #endif
+  
 #if defined (AVX1) || defined (AVX2) || defined (AVX512)
    inline void v_prefetch0(int size, const char *ptr){
          for(int i=0;i<size;i+=64){ //  Define L1 linesize above// What about SSE?
--- a/lib/Grid_stencil.h
+++ b/lib/Grid_stencil.h
@@ -120,8 +120,8 @@ namespace Grid {
 	  // Map to always positive shift modulo global full dimension.
 	  int shift = (displacement+fd)%fd;
 	  
-     	  int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
-	  assert (checkerboard== _checkerboard);
+	  //     	  int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
+	  assert (source.checkerboard== _checkerboard);

 	  // the permute type
 	  int simd_layout     = _grid->_simd_layout[dimension];
--- a/lib/Grid_threads.h
+++ b/lib/Grid_threads.h
@@ -5,6 +5,8 @@
 #define GRID_OMP
 #endif

+#define UNROLL  _Pragma("unroll")
+
 #ifdef GRID_OMP
 #include <omp.h>
 #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -25,7 +25,7 @@ namespace Grid {
  /////////////////////////////////////////////////////////////////////////////////////////////
    template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
    public:
-      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2);
+      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2)=0;
      void AdjOp(const Field &in, Field &out) {
 	Op(in,out);
      };
@@ -106,6 +106,10 @@ namespace Grid {
    public:
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
    };
+    template<class Field> class HermitianOperatorFunction {
+    public:
+      virtual void operator() (HermitianOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
+    };

    // FIXME : To think about

--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@@ -10,6 +10,7 @@ namespace Grid {
  /////////////////////////////////////////////////////////////////////////////////////////////
    template<class Field> class SparseMatrixBase {
    public:
+      GridBase *_grid;
      // Full checkerboar operations
      virtual RealD M    (const Field &in, Field &out)=0;
      virtual RealD Mdag (const Field &in, Field &out)=0;
@@ -18,6 +19,7 @@ namespace Grid {
 	ni=M(in,tmp);
 	no=Mdag(tmp,out);
      }
+      SparseMatrixBase(GridBase *grid) : _grid(grid) {};
    };

  /////////////////////////////////////////////////////////////////////////////////////////////
@@ -25,7 +27,7 @@ namespace Grid {
  /////////////////////////////////////////////////////////////////////////////////////////////
    template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
    public:
-      
+      GridBase *_cbgrid;
      // half checkerboard operaions
      virtual  void Meooe    (const Field &in, Field &out)=0;
      virtual  void Mooee    (const Field &in, Field &out)=0;
@@ -44,9 +46,7 @@ namespace Grid {
 	Meooe(out,tmp);

 	Mooee(in,out);
-	out=out-tmp; // axpy_norm
-	RealD n=norm2(out);
-	return n;
+	return axpy_norm(out,-1.0,tmp,out);
      }
      virtual  RealD MpcDag   (const Field &in, Field &out){
 	Field tmp(in._grid);
@@ -56,15 +56,15 @@ namespace Grid {
 	MeooeDag(out,tmp);

 	MooeeDag(in,out);
-	out=out-tmp; // axpy_norm
-	RealD n=norm2(out);
-	return n;
+	return axpy_norm(out,-1.0,tmp,out);
      }
-      virtual void MpcDagMpc(const Field &in, Field &out,RealD ni,RealD no) {
+      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
 	Field tmp(in._grid);
 	ni=Mpc(in,tmp);
-	no=Mpc(tmp,out);
+	no=MpcDag(tmp,out);
+	//	std::cout<<"MpcDagMpc "<<ni<<" "<<no<<std::endl;
      }
+      CheckerBoardedSparseMatrixBase(GridBase *grid,GridBase *cbgrid) : SparseMatrixBase<Field>(grid), _cbgrid(cbgrid) {};
    };

 }
--- a/lib/algorithms/approx/bigfloat.h
+++ b/lib/algorithms/approx/bigfloat.h
@@ -31,7 +31,7 @@ public:
  bigfloat(const double d) { mpf_init_set_d(x, d); }  
  bigfloat(const char *str) { mpf_init_set_str(x, (char*)str, 10); }
  ~bigfloat(void) { mpf_clear(x); }
-  operator const double (void) const { return (double)mpf_get_d(x); }
+  operator double (void) const { return (double)mpf_get_d(x); }
  static void setDefaultPrecision(unsigned long dprec) {
    unsigned long bprec =  (unsigned long)(3.321928094 * (double)dprec);
    mpf_set_default_prec(bprec);
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -9,18 +9,21 @@ namespace Grid {
    /////////////////////////////////////////////////////////////

  template<class Field> 
-    class ConjugateGradient :  public OperatorFunction<Field> {
+    class ConjugateGradient : public HermitianOperatorFunction<Field> {
 public:                                                
    RealD   Tolerance;
    Integer MaxIterations;
-
+    int verbose;
    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
-      std::cout << Tolerance<<std::endl;
+      verbose=0;
    };

-    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};
+
    void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){

+      psi.checkerboard = src.checkerboard;
+      conformable(psi,src);
+
      RealD cp,c,a,d,b,ssq,qq,b_pred;
      
      Field   p(src);
@@ -38,14 +41,16 @@ public:
      a  =norm2(p);
      cp =a;
      ssq=norm2(src);
-      
-      std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:     r "<<cp   <<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
-      
+
+      if ( verbose ) {
+	std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
+      }
+
      RealD rsq =  Tolerance* Tolerance*ssq;
      
      //Check if guess is really REALLY good :)
@@ -61,14 +66,16 @@ public:
 	c=cp;
 	
 	Linop.OpAndNorm(p,mmp,d,qq);
-	  
-	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<std::endl;
+
+	RealD    qqck = norm2(mmp);
+	ComplexD dck  = innerProduct(p,mmp);
+	//	if (verbose) std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<" qqcheck "<< qqck<< " dck "<< dck<<std::endl;
      
 	a      = c/d;
 	b_pred = a*(a*qq-d)/c;
-	
-	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;

+
+	//	if (verbose) std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
 	cp = axpy_norm(r,-a,mmp,r);
 	b = cp/c;
 	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,b "<<cp<< " "<<b <<std::endl;
@@ -77,7 +84,16 @@ public:
 	psi= a*p+psi;
 	p  = p*b+r;
 	  
-	std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	if (verbose) std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+	// Hack
+	if (0) {
+	  Field   tt(src);
+      	  Linop.Op(psi,mmp);
+	  tt=mmp-src;
+	  RealD resnorm = norm2(tt);
+	  std::cout<<"ConjugateGradient: Iteration " <<k<<" true residual "<<resnorm << " computed " << cp <<std::endl;
+	}

 	// Stopping condition
 	if ( cp <= rsq ) { 
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -1,6 +1,7 @@
 #ifndef GRID_SCHUR_RED_BLACK_H
 #define GRID_SCHUR_RED_BLACK_H

+
  /*
   * Red black Schur decomposition
   *
@@ -25,80 +26,86 @@
   *     M psi = eta
   ***********************
   *Odd
-   * i)   (D_oo)^{\dag} D_oo psi_o = (D_oo)^\dag L^{-1}  eta_o
-   *                        eta_o' = D_oo (eta_o - Moe Mee^{-1} eta_e)
+   * i)   (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
+   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *Even
   * ii)  Mee psi_e + Meo psi_o = src_e
   *
   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
   *
   */
-
 namespace Grid {

  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackSolve : public OperatorFunction<Field>{
+  template<class Field> class SchurRedBlackSolve {
  private:
-    SparseMatrixBase<Field> & _Matrix;
-    OperatorFunction<Field> & _HermitianRBSolver;
+    HermitianOperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
  public:

    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
-  SchurRedBlackSolve(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianRBSolver) 
-    :  _Matrix(Matrix), _HermitianRBSolver(HermitianRBSolver) { 
+  SchurRedBlackSolve(HermitianOperatorFunction<Field> &HermitianRBSolver)  :
+     _HermitianRBSolver(HermitianRBSolver) 
+    { 
      CBfactorise=0;
    };

-    void operator() (const Field &in, Field &out){
+    template<class Matrix>
+      void operator() (Matrix & _Matrix,const Field &in, Field &out){

      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME need to make eo grid from full grid.
      // FIXME use CBfactorise to control schur decomp
-      const int Even=0;
-      const int Odd =1;
-
-      // Make a cartesianRedBlack from full Grid
-      GridRedBlackCartesian grid(in._grid);
+      GridBase *grid = _Matrix._cbgrid;
+      GridBase *fgrid= _Matrix._grid;
 
-      Field src_e(&grid);
-      Field src_o(&grid);
-      Field sol_e(&grid);
-      Field sol_o(&grid);
-      Field   tmp(&grid);
-      Field  Mtmp(&grid);
-      
+      Field src_e(grid);
+      Field src_o(grid);
+      Field sol_e(grid);
+      Field sol_o(grid);
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+      Field resid(fgrid);
+
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);

      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     //    MooeeInv(source[Even],tmp,DaggerNo,Even);
-      _Matrix.Meooe   (tmp,Mtmp);      //    Meo     (tmp,src,Odd,DaggerNo);
-      tmp=src_o-Mtmp;                  //    axpy    (tmp,src,source[Odd],-1.0);
-      _Matrix.MpcDag(tmp,src_o);       //    Mprec(tmp,src,Mtmp,DaggerYes);  
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+      _Matrix.MpcDag(tmp,src_o);       assert(src_o.checkerboard ==Odd);       

      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
-      _HermitianRBSolver(src_o,sol_o); //  CGNE_prec_MdagM(solution[Odd],src);
+      HermitianCheckerBoardedOperator<Matrix,Field> _HermOpEO(_Matrix);
+      std::cout << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);

      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        // Meo(solution[Odd],tmp,Even,DaggerNo);
-      src_e = src_e-tmp;               // axpy(src,tmp,source[Even],-1.0);
-      _Matrix.MooeeInv(src_e,sol_e);   // MooeeInv(src,solution[Even],DaggerNo,Even);
+      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
+      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
     
-      setCheckerboard(out,sol_e);
-      setCheckerboard(out,sol_o);
- 
+      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
+
+      // Verify the unprec residual
+      _Matrix.M(out,resid); 
+      resid = resid-in;
+      RealD ns = norm2(in);
+      RealD nr = norm2(resid);
+
+      std::cout << "SchurRedBlack solver true unprec resid "<< sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }     
  };

--- a/lib/cartesian/Grid_cartesian_red_black.h
+++ b/lib/cartesian/Grid_cartesian_red_black.h
@@ -4,6 +4,13 @@

 namespace Grid {

+    static const int CbRed  =0;
+    static const int CbBlack=1;
+    static const int Even   =CbRed;
+    static const int Odd    =CbBlack;
+    static const int DaggerNo=0;
+    static const int DaggerYes=1;
+
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
@@ -44,6 +51,9 @@ public:
            return source_cb;
        }
    };
+
+    GridRedBlackCartesian(GridBase *base) : GridRedBlackCartesian(base->_fdimensions,base->_simd_layout,base->_processors)  {};
+
    GridRedBlackCartesian(std::vector<int> &dimensions,
 			  std::vector<int> &simd_layout,
 			  std::vector<int> &processor_grid ) : GridBase(processor_grid)
--- a/lib/lattice/Grid_lattice_ET.h
+++ b/lib/lattice/Grid_lattice_ET.h
@@ -91,6 +91,47 @@ inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<
  GridFromExpression(grid,std::get<2>(expr.second));
 }

+
+//////////////////////////////////////////////////////////////////////////
+// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
+//////////////////////////////////////////////////////////////////////////
+template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
+inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
+{
+  if ( (cb==Odd) || (cb==Even) ) {
+    assert(cb==lat.checkerboard);
+  } 
+  cb=lat.checkerboard;
+  //  std::cout<<"Lattice leaf cb "<<cb<<std::endl;
+}
+template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
+inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
+{
+  //  std::cout<<"Non lattice leaf cb"<<cb<<std::endl;
+}
+template <typename Op, typename T1>
+inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
+{
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
+  //  std::cout<<"Unary node cb "<<cb<<std::endl;
+}
+
+template <typename Op, typename T1, typename T2>
+inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) 
+{
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse
+  CBFromExpression(cb,std::get<1>(expr.second));
+  //  std::cout<<"Binary node cb "<<cb<<std::endl;
+}
+template <typename Op, typename T1, typename T2, typename T3>
+inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
+{
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse
+  CBFromExpression(cb,std::get<1>(expr.second));
+  CBFromExpression(cb,std::get<2>(expr.second));
+  //  std::cout<<"Trinary node cb "<<cb<<std::endl;
+}
+
 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -9,45 +9,68 @@ namespace Grid {
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class obj1,class obj2,class obj3> strong_inline
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
    conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
-      //      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#else
+      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
    }
  }
  
  template<class obj1,class obj2,class obj3> strong_inline
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
    conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
+#else
+      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
    }
  }
  
  template<class obj1,class obj2,class obj3> strong_inline
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
    conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
+#else
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
    }
  }
  template<class obj1,class obj2,class obj3> strong_inline
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
    conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
+#else
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
    }
  }
  
@@ -56,6 +79,7 @@ PARALLEL_FOR_LOOP
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class obj1,class obj2,class obj3> strong_inline
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    ret.checkerboard = lhs.checkerboard;
    conformable(lhs,ret);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
@@ -67,7 +91,8 @@ PARALLEL_FOR_LOOP
  
  template<class obj1,class obj2,class obj3> strong_inline
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-    conformable(lhs,ret);
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,lhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
      obj1 tmp;
@@ -78,22 +103,32 @@ PARALLEL_FOR_LOOP
  
  template<class obj1,class obj2,class obj3> strong_inline
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-    conformable(lhs,ret);
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,lhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
+#else 
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#endif
    }
  }
  template<class obj1,class obj2,class obj3> strong_inline
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    ret.checkerboard = lhs.checkerboard;
    conformable(lhs,ret);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
+#else 
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#endif
    }
  }

@@ -102,84 +137,112 @@ PARALLEL_FOR_LOOP
  //////////////////////////////////////////////////////////////////////////////////////////////////////
    template<class obj1,class obj2,class obj3> strong_inline
    void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      mult(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
+#else 
+      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
    }
  }
  
  template<class obj1,class obj2,class obj3> strong_inline
    void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      mac(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
+#else 
+      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
    }
  }
  
  template<class obj1,class obj2,class obj3> strong_inline
    void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
+#else 
+      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
    }
  }
  template<class obj1,class obj2,class obj3> strong_inline
    void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
+#else 
+      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
    }
  }
  
  template<class sobj,class vobj> strong_inline
  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
    conformable(x,y);
-#pragma omp parallel for
+PARALLEL_FOR_LOOP
    for(int ss=0;ss<x._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      vobj tmp = a*x._odata[ss]+y._odata[ss];
      vstream(ret._odata[ss],tmp);
+#else
+      ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
+#endif
    }
  }
  template<class sobj,class vobj> strong_inline
  void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
    conformable(x,y);
-#pragma omp parallel for
+PARALLEL_FOR_LOOP
    for(int ss=0;ss<x._grid->oSites();ss++){
+#ifdef STREAMING_STORES
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
      vstream(ret._odata[ss],tmp);
+#else
+      ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
+#endif
    }
  }

  template<class sobj,class vobj> strong_inline
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
    conformable(x,y);
-#pragma omp parallel for
-    for(int ss=0;ss<x._grid->oSites();ss++){
-      vobj tmp = a*x._odata[ss]+y._odata[ss];
-      vstream(ret._odata[ss],tmp);
-    }
+    axpy(ret,a,x,y);
    return norm2(ret);
  }
  template<class sobj,class vobj> strong_inline
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
    conformable(x,y);
-#pragma omp parallel for
-    for(int ss=0;ss<x._grid->oSites();ss++){
-      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
-      vstream(ret._odata[ss],tmp);
-    }
+    axpby(ret,a,b,x,y);
    return norm2(ret); // FIXME implement parallel norm in ss loop
  }

--- a/lib/lattice/Grid_lattice_base.h
+++ b/lib/lattice/Grid_lattice_base.h
@@ -1,6 +1,8 @@
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H

+#define STREAMING_STORES
+
 namespace Grid {

 // TODO: 
@@ -64,60 +66,136 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  template <typename Op, typename T1>                         strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
-      vobj tmp= eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
    }
    return *this;
  }
  template <typename Op, typename T1,typename T2>             strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
-      vobj tmp= eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
    }
    return *this;
  }
  template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
-      vobj tmp= eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss] = eval(ss,expr);
+#endif
    }
    return *this;
  }
  //GridFromExpression is tricky to do
  template<class Op,class T1>
    Lattice(const LatticeUnaryExpression<Op,T1> & expr):    _grid(nullptr){
+
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
    _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss] = eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=_eval(ss,expr);
+#endif
    }
  };
  template<class Op,class T1, class T2>
  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr):    _grid(nullptr){
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
    _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss] = eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(tmp,ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
    }
  };
  template<class Op,class T1, class T2, class T3>
  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr):    _grid(nullptr){
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
    _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss] = eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
    }
  };

@@ -140,6 +218,7 @@ PARALLEL_FOR_LOOP
        return *this;
    }
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
+      this->checkerboard = r.checkerboard;
      conformable(*this,r);
      std::cout<<"Lattice operator ="<<std::endl;
 PARALLEL_FOR_LOOP
--- a/lib/math/Grid_math_arith_mul.h
+++ b/lib/math/Grid_math_arith_mul.h
@@ -16,19 +16,18 @@ strong_inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> *
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
  for(int c1=0;c1<N;c1++){
-    int c3=0;
    for(int c2=0;c2<N;c2++){
-      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
    }
  }
-  for(int c3=1;c3<N;c3++){
-    for(int c1=0;c1<N;c1++){
+  for(int c1=0;c1<N;c1++){
+    for(int c3=1;c3<N;c3++){
      for(int c2=0;c2<N;c2++){
 	mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
      }
    }
  }
-    return;
+  return;
 }

 template<class rrtype,class ltype,class rtype,int N>
--- a/lib/math/Grid_math_tensors.h
+++ b/lib/math/Grid_math_tensors.h
@@ -34,66 +34,64 @@ public:

  // Scalar no action
  //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
-  iScalar()=default;
+  iScalar() = default;
+  /*
+  iScalar(const iScalar<vtype> &copyme)=default;
+  iScalar(iScalar<vtype> &&copyme)=default;
+  iScalar<vtype> & operator= (const iScalar<vtype> &copyme) = default;
+  iScalar<vtype> & operator= (iScalar<vtype> &&copyme) = default;
+  */
  iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
  iScalar(const Zero &z){ *this = zero; };

+  iScalar<vtype> & operator= (const Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
+    vstream(out._internal,in._internal);
+  }
+  friend strong_inline void zeroit(iScalar<vtype> &that){
+    zeroit(that._internal);
+  }
+  friend strong_inline void prefetch(iScalar<vtype> &that){
+    prefetch(that._internal);
+  }
+  friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
+    permute(out._internal,in._internal,permutetype);
+  }

-
-
-
-    iScalar<vtype> & operator= (const Zero &hero){
-      zeroit(*this);
-      return *this;
-    }
-    friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
-      vstream(out._internal,in._internal);
-    }
-
-
-    friend strong_inline void zeroit(iScalar<vtype> &that){
-        zeroit(that._internal);
-    }
-    friend strong_inline void prefetch(iScalar<vtype> &that){
-      prefetch(that._internal);
-    }
-    friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
-      permute(out._internal,in._internal,permutetype);
-    }
-
-    // Unary negation
-    friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
-        iScalar<vtype> ret;
-        ret._internal= -r._internal;
-        return ret;
-    }
-    // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-    strong_inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
-        *this = (*this)*r;
-        return *this;
-    }
-    strong_inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
-        *this = (*this)-r;
-        return *this;
-    }
-    strong_inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
-        *this = (*this)+r;
-        return *this;
-    }
-    
-    strong_inline vtype & operator ()(void) {
-      return _internal;
-    }
-
-    strong_inline const vtype & operator ()(void) const {
-      return _internal;
-    }
-
-    operator ComplexD () const { return(TensorRemove(_internal)); };
-    operator RealD () const { return(real(TensorRemove(_internal))); }
-
-    // convert from a something to a scalar
-    template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar<vtype>
+  // Unary negation
+  friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
+    iScalar<vtype> ret;
+    ret._internal= -r._internal;
+    return ret;
+  }
+  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
+  strong_inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  strong_inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  strong_inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+  strong_inline vtype & operator ()(void) {
+    return _internal;
+  }
+  strong_inline const vtype & operator ()(void) const {
+    return _internal;
+  }
+  
+  operator ComplexD () const { return(TensorRemove(_internal)); };
+  operator RealD () const { return(real(TensorRemove(_internal))); }
+  
+  // convert from a something to a scalar
+  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar<vtype>
    { 
      _internal = vtype(arg);
      return *this;
@@ -129,67 +127,72 @@ public:
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
  iVector(const Zero &z){ *this = zero; };
  iVector() =default;
+  /*
+  iVector(const iVector<vtype,N> &copyme)=default;
+  iVector(iVector<vtype,N> &&copyme)=default;
+  iVector<vtype,N> & operator= (const iVector<vtype,N> &copyme) = default;
+  iVector<vtype,N> & operator= (iVector<vtype,N> &&copyme) = default;
+  */

-    iVector<vtype,N> & operator= (const Zero &hero){
-        zeroit(*this);
-        return *this;
+  iVector<vtype,N> & operator= (const Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend strong_inline void zeroit(iVector<vtype,N> &that){
+    for(int i=0;i<N;i++){
+      zeroit(that._internal[i]);
    }
-    friend strong_inline void zeroit(iVector<vtype,N> &that){
-        for(int i=0;i<N;i++){
-            zeroit(that._internal[i]);
-        }
+  }
+  friend strong_inline void prefetch(iVector<vtype,N> &that){
+    for(int i=0;i<N;i++) prefetch(that._internal[i]);
+  }
+  friend strong_inline void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
+    for(int i=0;i<N;i++){
+      vstream(out._internal[i],in._internal[i]);
    }
-    friend strong_inline void prefetch(iVector<vtype,N> &that){
-      for(int i=0;i<N;i++) prefetch(that._internal[i]);
+  }
+  friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
+    for(int i=0;i<N;i++){
+      permute(out._internal[i],in._internal[i],permutetype);
    }
-    friend strong_inline void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
-      for(int i=0;i<N;i++){
-	vstream(out._internal[i],in._internal[i]);
-      }
+  }
+  // Unary negation
+  friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
+    return ret;
+  }
+  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
+  strong_inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  strong_inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  strong_inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+  strong_inline vtype & operator ()(int i) {
+    return _internal[i];
+  }
+  strong_inline const vtype & operator ()(int i) const {
+    return _internal[i];
+  }
+  friend std::ostream& operator<< (std::ostream& stream, const iVector<vtype,N> &o){
+    stream<< "V<"<<N<<">{";
+    for(int i=0;i<N;i++) {
+      stream<<o._internal[i];
+      if (i<N-1)	stream<<",";
    }
-
-    friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
-      for(int i=0;i<N;i++){
-	permute(out._internal[i],in._internal[i],permutetype);
-      }
-    }
-    // Unary negation
-    friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
-        iVector<vtype,N> ret;
-        for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
-        return ret;
-    }
-    // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-    strong_inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
-        *this = (*this)*r;
-        return *this;
-    }
-    strong_inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
-        *this = (*this)-r;
-        return *this;
-    }
-    strong_inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
-        *this = (*this)+r;
-        return *this;
-    }
-    strong_inline vtype & operator ()(int i) {
-      return _internal[i];
-    }
-    strong_inline const vtype & operator ()(int i) const {
-      return _internal[i];
-    }
-    friend std::ostream& operator<< (std::ostream& stream, const iVector<vtype,N> &o){
-      stream<< "V<"<<N<<">{";
-      for(int i=0;i<N;i++) {
-	stream<<o._internal[i];
-	if (i<N-1)	stream<<",";
-      }
-      stream<<"}";
-      return stream;
-    };
-    //    strong_inline vtype && operator ()(int i) {
-    //      return _internal[i];
-    //    }
+    stream<<"}";
+    return stream;
+  };
+  //    strong_inline vtype && operator ()(int i) {
+  //      return _internal[i];
+  //    }
 };
    
 template<class vtype,int N> class iMatrix 
@@ -209,8 +212,6 @@ public:

  iMatrix(const Zero &z){ *this = zero; };
  iMatrix() =default;
-
-  // No copy constructor...
  
  iMatrix& operator=(const iMatrix& rhs){
    for(int i=0;i<N;i++)
@@ -221,6 +222,17 @@ public:
  
 

+  iMatrix(scalar_type s)  { (*this) = s ;};// recurse down and hit the constructor for vector_type
+
+  /*
+  iMatrix(const iMatrix<vtype,N> &copyme)=default;
+  iMatrix(iMatrix<vtype,N> &&copyme)=default;
+  iMatrix<vtype,N> & operator= (const iMatrix<vtype,N> &copyme) = default;
+  iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &&copyme) = default;
+  */
+
+
+
  iMatrix<vtype,N> & operator= (const Zero &hero){
    zeroit(*this);
    return *this;
--- a/lib/qcd/Grid_qcd.h
+++ b/lib/qcd/Grid_qcd.h
@@ -10,9 +10,6 @@ namespace QCD {
    static const int Nhs=2; // half spinor
    static const int Nds=8; // double stored gauge field

-    static const int CbRed  =0;
-    static const int CbBlack=1;
-    
    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
    // Index conventions:                            Lorentz x Spin x Colour
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -4,8 +4,8 @@
 namespace Grid {
 namespace QCD {

-const std::vector<int> WilsonMatrix::directions   ({0,1,2,3, 0, 1, 2, 3,0});
-const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1,0});
+const std::vector<int> WilsonMatrix::directions   ({0,1,2,3, 0, 1, 2, 3});
+const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1});

  // Should be in header?
 const int WilsonMatrix::Xp = 0;
@@ -16,7 +16,6 @@ const int WilsonMatrix::Xm = 4;
 const int WilsonMatrix::Ym = 5;
 const int WilsonMatrix::Zm = 6;
 const int WilsonMatrix::Tm = 7;
-  //const int WilsonMatrix::X0 = 8;

  class WilsonCompressor {
  public:
@@ -72,20 +71,27 @@ const int WilsonMatrix::Tm = 7;
    }
  };

-  WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,double _mass)
-  : Stencil(Umu._grid,npoint,0,directions,displacements),
+  WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid, double _mass)  : 
+    CheckerBoardedSparseMatrixBase<LatticeFermion>(&Fgrid,&Hgrid),
+    Stencil    (  _grid,npoint,Even,directions,displacements),
+    StencilEven(_cbgrid,npoint,Even,directions,displacements), // source is Even
+    StencilOdd (_cbgrid,npoint,Odd ,directions,displacements), // source is Odd
    mass(_mass),
-    Umu(_Umu._grid)
-{
-  // Allocate the required comms buffer
-  grid = _Umu._grid;
-  comm_buf.resize(Stencil._unified_buffer_size);
-  DoubleStore(Umu,_Umu);
-}
+    Umu(_grid),
+    UmuEven(_cbgrid),
+    UmuOdd (_cbgrid)
+  {
+    // Allocate the required comms buffer
+    comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
+    
+    DoubleStore(Umu,_Umu);
+    pickCheckerboard(Even,UmuEven,Umu);
+    pickCheckerboard(Odd ,UmuOdd,Umu);
+  }
      
 void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
 {
-  LatticeColourMatrix U(grid);
+  LatticeColourMatrix U(_grid);

  for(int mu=0;mu<Nd;mu++){
    U = peekIndex<LorentzIndex>(Umu,mu);
@@ -97,48 +103,63 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF

 RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out,0);
+  out.checkerboard=in.checkerboard;
+  Dhop(in,out,DaggerNo);
  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
  return norm2(out);
 }
 RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out,1);
+  out.checkerboard=in.checkerboard;
+  Dhop(in,out,DaggerYes);
  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
  return norm2(out);
 }

 void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out,0);
-  out = 0.5*out; // FIXME : scale factor in Dhop
+  if ( in.checkerboard == Odd ) {
+    DhopEO(in,out,DaggerNo);
+  } else {
+    DhopOE(in,out,DaggerNo);
+  }
+  out = (-0.5)*out; // FIXME : scale factor in Dhop
 }
 void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out,1);
+  if ( in.checkerboard == Odd ) {
+    DhopEO(in,out,DaggerYes);
+  } else {
+    DhopOE(in,out,DaggerYes);
+  }
+  out = (-0.5)*out; // FIXME : scale factor in Dhop
 }
 void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
 {
+  out.checkerboard = in.checkerboard;
  out = (4.0+mass)*in;
  return ;
 }
-void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
-{
-  out = (1.0/(4.0+mass))*in;
-  return ;
-}
 void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
 {
+  out.checkerboard = in.checkerboard;
+  Mooee(in,out);
+}
+void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
+{
+  out.checkerboard = in.checkerboard;
  out = (1.0/(4.0+mass))*in;
  return ;
 }
 void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  out = (1.0/(4.0+mass))*in;
-  return ;
+  out.checkerboard = in.checkerboard;
+  MooeeInv(in,out);
 }

-void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
+void WilsonMatrix::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
+			    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+			    int ss,const LatticeFermion &in, LatticeFermion &out)
 {
    vHalfSpinColourVector  tmp;    
    vHalfSpinColourVector  chi;    
@@ -146,79 +167,78 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
    vHalfSpinColourVector Uchi;
    int offset,local,perm, ptype;

-    //    int ss = Stencil._LebesgueReorder[sss];
-    int ssu= ss;
+    //#define VERBOSE( A)  if ( ss<10 ) { std::cout << "site " <<ss << " " #A " neigh " << offset << " perm "<< perm <<std::endl;}    

    // Xp
-    offset = Stencil._offsets [Xp][ss];
-    local  = Stencil._is_local[Xp][ss];
-    perm   = Stencil._permute[Xp][ss];
+    offset = st._offsets [Xp][ss];
+    local  = st._is_local[Xp][ss];
+    perm   = st._permute[Xp][ss];

-    ptype  = Stencil._permute_type[Xp];
+    ptype  = st._permute_type[Xp];
    if ( local && perm ) {
      spProjXp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjXp(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
+    mult(&Uchi(),&U._odata[ss](Xp),&chi());
    spReconXp(result,Uchi);

    // Yp
-    offset = Stencil._offsets [Yp][ss];
-    local  = Stencil._is_local[Yp][ss];
-    perm   = Stencil._permute[Yp][ss];
-    ptype  = Stencil._permute_type[Yp];
+    offset = st._offsets [Yp][ss];
+    local  = st._is_local[Yp][ss];
+    perm   = st._permute[Yp][ss];
+    ptype  = st._permute_type[Yp];
    if ( local && perm ) {
      spProjYp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjYp(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
+    mult(&Uchi(),&U._odata[ss](Yp),&chi());
    accumReconYp(result,Uchi);

    // Zp
-    offset = Stencil._offsets [Zp][ss];
-    local  = Stencil._is_local[Zp][ss];
-    perm   = Stencil._permute[Zp][ss];
-    ptype  = Stencil._permute_type[Zp];
+    offset = st._offsets [Zp][ss];
+    local  = st._is_local[Zp][ss];
+    perm   = st._permute[Zp][ss];
+    ptype  = st._permute_type[Zp];
    if ( local && perm ) {
      spProjZp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjZp(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
+    mult(&Uchi(),&U._odata[ss](Zp),&chi());
    accumReconZp(result,Uchi);

    // Tp
-    offset = Stencil._offsets [Tp][ss];
-    local  = Stencil._is_local[Tp][ss];
-    perm   = Stencil._permute[Tp][ss];
-    ptype  = Stencil._permute_type[Tp];
+    offset = st._offsets [Tp][ss];
+    local  = st._is_local[Tp][ss];
+    perm   = st._permute[Tp][ss];
+    ptype  = st._permute_type[Tp];
    if ( local && perm ) {
      spProjTp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjTp(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
+    mult(&Uchi(),&U._odata[ss](Tp),&chi());
    accumReconTp(result,Uchi);

    // Xm
-    offset = Stencil._offsets [Xm][ss];
-    local  = Stencil._is_local[Xm][ss];
-    perm   = Stencil._permute[Xm][ss];
-    ptype  = Stencil._permute_type[Xm];
+    offset = st._offsets [Xm][ss];
+    local  = st._is_local[Xm][ss];
+    perm   = st._permute[Xm][ss];
+    ptype  = st._permute_type[Xm];

    if ( local && perm ) 
    {
@@ -227,17 +247,17 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
    } else if ( local ) {
      spProjXm(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
+    mult(&Uchi(),&U._odata[ss](Xm),&chi());
    accumReconXm(result,Uchi);


    // Ym
-    offset = Stencil._offsets [Ym][ss];
-    local  = Stencil._is_local[Ym][ss];
-    perm   = Stencil._permute[Ym][ss];
-    ptype  = Stencil._permute_type[Ym];
+    offset = st._offsets [Ym][ss];
+    local  = st._is_local[Ym][ss];
+    perm   = st._permute[Ym][ss];
+    ptype  = st._permute_type[Ym];

    if ( local && perm ) {
      spProjYm(tmp,in._odata[offset]);
@@ -245,46 +265,49 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
    } else if ( local ) {
      spProjYm(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
+    mult(&Uchi(),&U._odata[ss](Ym),&chi());
    accumReconYm(result,Uchi);

    // Zm
-    offset = Stencil._offsets [Zm][ss];
-    local  = Stencil._is_local[Zm][ss];
-    perm   = Stencil._permute[Zm][ss];
-    ptype  = Stencil._permute_type[Zm];
+    offset = st._offsets [Zm][ss];
+    local  = st._is_local[Zm][ss];
+    perm   = st._permute[Zm][ss];
+    ptype  = st._permute_type[Zm];
    if ( local && perm ) {
      spProjZm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjZm(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
+    mult(&Uchi(),&U._odata[ss](Zm),&chi());
    accumReconZm(result,Uchi);

    // Tm
-    offset = Stencil._offsets [Tm][ss];
-    local  = Stencil._is_local[Tm][ss];
-    perm   = Stencil._permute[Tm][ss];
-    ptype  = Stencil._permute_type[Tm];
+    offset = st._offsets [Tm][ss];
+    local  = st._is_local[Tm][ss];
+    perm   = st._permute[Tm][ss];
+    ptype  = st._permute_type[Tm];
    if ( local && perm ) {
      spProjTm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjTm(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
+    mult(&Uchi(),&U._odata[ss](Tm),&chi());
    accumReconTm(result,Uchi);

    vstream(out._odata[ss],result);
 }
-void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
+
+void WilsonMatrix::DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
+			       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+			       int ss,const LatticeFermion &in, LatticeFermion &out)
 {
    vHalfSpinColourVector  tmp;    
    vHalfSpinColourVector  chi;    
@@ -292,78 +315,76 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
    vHalfSpinColourVector Uchi;
    int offset,local,perm, ptype;

-    int ssu= ss;
-
    // Xp
-    offset = Stencil._offsets [Xm][ss];
-    local  = Stencil._is_local[Xm][ss];
-    perm   = Stencil._permute[Xm][ss];
+    offset = st._offsets [Xm][ss];
+    local  = st._is_local[Xm][ss];
+    perm   = st._permute[Xm][ss];

-    ptype  = Stencil._permute_type[Xm];
+    ptype  = st._permute_type[Xm];
    if ( local && perm ) {
      spProjXp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjXp(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
+    mult(&Uchi(),&U._odata[ss](Xm),&chi());
    spReconXp(result,Uchi);

    // Yp
-    offset = Stencil._offsets [Ym][ss];
-    local  = Stencil._is_local[Ym][ss];
-    perm   = Stencil._permute[Ym][ss];
-    ptype  = Stencil._permute_type[Ym];
+    offset = st._offsets [Ym][ss];
+    local  = st._is_local[Ym][ss];
+    perm   = st._permute[Ym][ss];
+    ptype  = st._permute_type[Ym];
    if ( local && perm ) {
      spProjYp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjYp(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
+    mult(&Uchi(),&U._odata[ss](Ym),&chi());
    accumReconYp(result,Uchi);

    // Zp
-    offset = Stencil._offsets [Zm][ss];
-    local  = Stencil._is_local[Zm][ss];
-    perm   = Stencil._permute[Zm][ss];
-    ptype  = Stencil._permute_type[Zm];
+    offset = st._offsets [Zm][ss];
+    local  = st._is_local[Zm][ss];
+    perm   = st._permute[Zm][ss];
+    ptype  = st._permute_type[Zm];
    if ( local && perm ) {
      spProjZp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjZp(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
+    mult(&Uchi(),&U._odata[ss](Zm),&chi());
    accumReconZp(result,Uchi);

    // Tp
-    offset = Stencil._offsets [Tm][ss];
-    local  = Stencil._is_local[Tm][ss];
-    perm   = Stencil._permute[Tm][ss];
-    ptype  = Stencil._permute_type[Tm];
+    offset = st._offsets [Tm][ss];
+    local  = st._is_local[Tm][ss];
+    perm   = st._permute[Tm][ss];
+    ptype  = st._permute_type[Tm];
    if ( local && perm ) {
      spProjTp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjTp(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
+    mult(&Uchi(),&U._odata[ss](Tm),&chi());
    accumReconTp(result,Uchi);

    // Xm
-    offset = Stencil._offsets [Xp][ss];
-    local  = Stencil._is_local[Xp][ss];
-    perm   = Stencil._permute[Xp][ss];
-    ptype  = Stencil._permute_type[Xp];
+    offset = st._offsets [Xp][ss];
+    local  = st._is_local[Xp][ss];
+    perm   = st._permute[Xp][ss];
+    ptype  = st._permute_type[Xp];

    if ( local && perm ) 
    {
@@ -372,17 +393,16 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
    } else if ( local ) {
      spProjXm(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
+    mult(&Uchi(),&U._odata[ss](Xp),&chi());
    accumReconXm(result,Uchi);

-
    // Ym
-    offset = Stencil._offsets [Yp][ss];
-    local  = Stencil._is_local[Yp][ss];
-    perm   = Stencil._permute[Yp][ss];
-    ptype  = Stencil._permute_type[Yp];
+    offset = st._offsets [Yp][ss];
+    local  = st._is_local[Yp][ss];
+    perm   = st._permute[Yp][ss];
+    ptype  = st._permute_type[Yp];

    if ( local && perm ) {
      spProjYm(tmp,in._odata[offset]);
@@ -390,69 +410,97 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
    } else if ( local ) {
      spProjYm(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
+    mult(&Uchi(),&U._odata[ss](Yp),&chi());
    accumReconYm(result,Uchi);

    // Zm
-    offset = Stencil._offsets [Zp][ss];
-    local  = Stencil._is_local[Zp][ss];
-    perm   = Stencil._permute[Zp][ss];
-    ptype  = Stencil._permute_type[Zp];
+    offset = st._offsets [Zp][ss];
+    local  = st._is_local[Zp][ss];
+    perm   = st._permute[Zp][ss];
+    ptype  = st._permute_type[Zp];
    if ( local && perm ) {
      spProjZm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjZm(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
+    mult(&Uchi(),&U._odata[ss](Zp),&chi());
    accumReconZm(result,Uchi);

    // Tm
-    offset = Stencil._offsets [Tp][ss];
-    local  = Stencil._is_local[Tp][ss];
-    perm   = Stencil._permute[Tp][ss];
-    ptype  = Stencil._permute_type[Tp];
+    offset = st._offsets [Tp][ss];
+    local  = st._is_local[Tp][ss];
+    perm   = st._permute[Tp][ss];
+    ptype  = st._permute_type[Tp];
    if ( local && perm ) {
      spProjTm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjTm(chi,in._odata[offset]);
    } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
+    mult(&Uchi(),&U._odata[ss](Tp),&chi());
    accumReconTm(result,Uchi);

    vstream(out._odata[ss],result);
 }

-void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+void WilsonMatrix::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U,
+				const LatticeFermion &in, LatticeFermion &out,int dag)
 {
-  assert((dag==0) ||(dag==1));
-
+  assert((dag==DaggerNo) ||(dag==DaggerYes));
  WilsonCompressor compressor(dag);
-  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);

-  if ( dag ) {
+  st.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+
+  if ( dag == DaggerYes ) {
 PARALLEL_FOR_LOOP
-    for(int sss=0;sss<grid->oSites();sss++){
-      DhopSiteDag(sss,in,out);
+    for(int sss=0;sss<in._grid->oSites();sss++){
+      DhopSiteDag(st,U,comm_buf,sss,in,out);
    }
  } else {
 PARALLEL_FOR_LOOP
-    for(int sss=0;sss<grid->oSites();sss++){
-      DhopSite(sss,in,out);
+    for(int sss=0;sss<in._grid->oSites();sss++){
+      DhopSite(st,U,comm_buf,sss,in,out);
    }
  }
 }
+void WilsonMatrix::DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag)
+{
+  conformable(in._grid,_cbgrid);    // verifies half grid
+  conformable(in._grid,out._grid); // drops the cb check

+  assert(in.checkerboard==Even);
+  out.checkerboard = Odd;

+  DhopInternal(StencilEven,UmuOdd,in,out,dag);
+}
+void WilsonMatrix::DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag)
+{
+  conformable(in._grid,_cbgrid);    // verifies half grid
+  conformable(in._grid,out._grid); // drops the cb check

+  assert(in.checkerboard==Odd);
+  out.checkerboard = Even;

+  DhopInternal(StencilOdd,UmuEven,in,out,dag);
+}
+void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+{
+  conformable(in._grid,_grid); // verifies full grid
+  conformable(in._grid,out._grid);
+
+  out.checkerboard = in.checkerboard;
+
+  DhopInternal(Stencil,Umu,in,out,dag);
+}

 }}

+
+
--- a/lib/qcd/Grid_qcd_wilson_dop.h
+++ b/lib/qcd/Grid_qcd_wilson_dop.h
@@ -11,14 +11,20 @@ namespace Grid {
      //NB r=1;
    public:
      double                        mass;
-      GridBase                     *grid;
+      //      GridBase                     *    grid; // Inherited
+      //      GridBase                     *  cbgrid;

-      // Copy of the gauge field 
-      LatticeDoubledGaugeField             Umu;
+      //Defines the stencils for even and odd
+      CartesianStencil Stencil; 
+      CartesianStencil StencilEven; 
+      CartesianStencil StencilOdd; 

-      //Defines the stencil
-      CartesianStencil              Stencil; 
-      static const int npoint=9;
+      // Copy of the gauge field , with even and odd subsets
+      LatticeDoubledGaugeField Umu;
+      LatticeDoubledGaugeField UmuEven;
+      LatticeDoubledGaugeField UmuOdd;
+
+      static const int npoint=8;
      static const std::vector<int> directions   ;
      static const std::vector<int> displacements;
      static const int Xp,Xm,Yp,Ym,Zp,Zm,Tp,Tm;
@@ -27,7 +33,7 @@ namespace Grid {
      std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  comm_buf;

      // Constructor
-      WilsonMatrix(LatticeGaugeField &Umu,double mass);
+      WilsonMatrix(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid,double _mass);

      // DoubleStore
      void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
@@ -45,9 +51,19 @@ namespace Grid {
      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);

      // non-hermitian hopping term; half cb or both
-      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void DhopSite   (int ss,const LatticeFermion &in, LatticeFermion &out);
-      void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);
+      void Dhop  (const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField &U,
+			const LatticeFermion &in, LatticeFermion &out,int dag);
+      // These ones will need to be package intelligently. WilsonType base class
+      // for use by DWF etc..
+      void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
+		    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+		    int ss,const LatticeFermion &in, LatticeFermion &out);
+      void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
+		       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+		       int ss,const LatticeFermion &in, LatticeFermion &out);

      typedef iScalar<iMatrix<vComplex, Nc> > matrix;

--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -283,6 +283,7 @@ namespace Optimization {
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
 namespace Grid {
+
  typedef __m128 SIMD_Ftype;  // Single precision type
  typedef __m128d SIMD_Dtype; // Double precision type
  typedef __m128i SIMD_Itype; // Integer type
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,7 +2,7 @@
 /*! @file Grid_vector_types.h
  @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-05-22 17:08:19 neo>
+// Time-stamp: <2015-05-26 12:05:39 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
@@ -21,31 +21,24 @@
 namespace Grid {

  // To take the floating point type of real/complex type
-  template <typename T> 
-    struct RealPart {
-      typedef T type;
-    };
-  template <typename T> 
-    struct RealPart< std::complex<T> >{
+  template <typename T> struct RealPart {
+    typedef T type;
+  };
+  template <typename T> struct RealPart< std::complex<T> >{
    typedef T type;
  };

  // type alias used to simplify the syntax of std::enable_if
-  template <typename T> using Invoke =
-    typename T::type;
-  template <typename Condition, typename ReturnType> using EnableIf =
-    Invoke<std::enable_if<Condition::value, ReturnType>>;
-  template <typename Condition, typename ReturnType> using NotEnableIf =
-    Invoke<std::enable_if<!Condition::value, ReturnType>>;
+  template <typename T> using Invoke                                  =  typename T::type;
+  template <typename Condition, typename ReturnType> using EnableIf   =    Invoke<std::enable_if<Condition::value, ReturnType>>;
+  template <typename Condition, typename ReturnType> using NotEnableIf=    Invoke<std::enable_if<!Condition::value, ReturnType>>;
  


  ////////////////////////////////////////////////////////
  // Check for complexity with type traits
-  template <typename T> 
-    struct is_complex : std::false_type {};
-  template < typename T > 
-    struct is_complex< std::complex<T> >: std::true_type {};
+  template <typename T>     struct is_complex : std::false_type {};
+  template < typename T >   struct is_complex< std::complex<T> >: std::true_type {};
  ////////////////////////////////////////////////////////
  // Define the operation templates functors
  // general forms to allow for vsplat syntax
@@ -102,8 +95,6 @@ namespace Grid {
    Grid_simd(const Real a){
      vsplat(*this,Scalar_type(a));
    };
-
-
       
    ///////////////////////////////////////////////
    // mac, mult, sub, add, adj
@@ -145,10 +136,6 @@ namespace Grid {
      friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
    template <  class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 > 
      friend inline void vfalse(Grid_simd &ret){vsplat(ret,0);}
-
-
-   
- 
   
    ////////////////////////////////////
    // Arithmetic operator overloads +,-,*
@@ -184,7 +171,6 @@ namespace Grid {
 	ret.v = binary<Vector_type>(a.v,b.v, MultSIMD());
 	return ret;
      };
-    

    ////////////////////////////////////////////////////////////////////////
    // FIXME:  gonna remove these load/store, get, set, prefetch
--- a/lib/simd/Old/Grid_vComplexD.h
+++ b/lib/simd/Old/Grid_vComplexD.h
@@ -345,20 +345,30 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 // REDUCE FIXME must be a cleaner implementation
       friend inline ComplexD Reduce(const vComplexD & in)
       { 
+	 vComplexD v1,v2;
+	 union { 
+	   zvec v;
+	   double f[sizeof(zvec)/sizeof(double)];
+	 } conv;
+	   
 #ifdef SSE4
-	 return ComplexD(in.v[0],in.v[1]);
+	 v1=in;
 #endif
 #if defined(AVX1) || defined (AVX2)
-	 vComplexD v1;
 	 permute(v1,in,0); // sse 128; paired complex single
 	 v1=v1+in;
-	 return ComplexD(v1.v[0],v1.v[1]);
 #endif
 #ifdef AVX512
-	 return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
+	 permute(v1,in,0); // sse 128; paired complex single
+	 v1=v1+in;
+	 permute(v2,v1,1); // avx 256; quad complex single
+	 v1=v1+v2;
 #endif 
 #ifdef QPX
+#error
 #endif
+	 conv.v = v1.v;
+	 return ComplexD(conv.f[0],conv.f[1]);
        }
        
        // Unary negation
--- a/lib/simd/Old/Grid_vComplexF.h
+++ b/lib/simd/Old/Grid_vComplexF.h
@@ -234,26 +234,34 @@ namespace Grid {
 	}
 	friend inline ComplexF Reduce(const vComplexF & in)
 	{
+	 vComplexF v1,v2;
+	 union { 
+	   cvec v;
+	   float f[sizeof(cvec)/sizeof(float)];
+	 } conv;
 #ifdef SSE4
-	 vComplexF v1;
 	 permute(v1,in,0); // sse 128; paired complex single
 	 v1=v1+in;
-	 return ComplexF(v1.v[0],v1.v[1]);
 #endif
 #if defined(AVX1) || defined (AVX2)
-	 vComplexF v1,v2;
 	 permute(v1,in,0); // sse 128; paired complex single
 	 v1=v1+in;
 	 permute(v2,v1,1); // avx 256; quad complex single
 	 v1=v1+v2;
-	 return ComplexF(v1.v[0],v1.v[1]);
 #endif
 #ifdef AVX512
-            return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
+	 permute(v1,in,0); // avx512 octo-complex single
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 permute(v2,v1,2); 
+	 v1=v1+v2;
 #endif
 #ifdef QPX
 #error
 #endif
+	 conv.v = v1.v;
+	 return ComplexF(conv.f[0],conv.f[1]);
        }

        friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
--- a/lib/simd/Old/Grid_vRealD.h
+++ b/lib/simd/Old/Grid_vRealD.h
@@ -210,25 +210,33 @@ namespace Grid {

       friend inline RealD Reduce(const vRealD & in)
       {
+	 vRealD v1,v2;
+	 union { 
+	   dvec v;
+	   double f[sizeof(dvec)/sizeof(double)];
+	 } conv;
 #ifdef SSE4
-	 vRealD v1;
 	 permute(v1,in,0); // sse 128; paired real double
 	 v1=v1+in;
-	 return RealD(v1.v[0]);
 #endif
 #if defined(AVX1) || defined (AVX2)
-	 vRealD v1,v2;
 	 permute(v1,in,0); // avx 256; quad double
 	 v1=v1+in;
 	 permute(v2,v1,1); 
 	 v1=v1+v2;
-	 return v1.v[0];
 #endif
 #ifdef AVX512
-            return _mm512_reduce_add_pd(in.v);
+	 permute(v1,in,0); // avx 512; octo-double
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 permute(v2,v1,2); 
+	 v1=v1+v2;
 #endif
 #ifdef QPX
 #endif
+	 conv.v=v1.v;
+	 return conv.f[0];
       }

        // *=,+=,-= operators
--- a/lib/simd/Old/Grid_vRealF.h
+++ b/lib/simd/Old/Grid_vRealF.h
@@ -243,29 +243,39 @@ friend inline void vstore(const vRealF &ret, float *a){
        }
       friend inline RealF Reduce(const vRealF & in)
       {
-#ifdef SSE4
 	 vRealF v1,v2;
+	 union { 
+	   fvec v;
+	   float f[sizeof(fvec)/sizeof(double)];
+	 } conv;
+#ifdef SSE4
 	 permute(v1,in,0); // sse 128; quad single
 	 v1=v1+in;
 	 permute(v2,v1,1); 
 	 v1=v1+v2;
-	 return v1.v[0];
 #endif
 #if defined(AVX1) || defined (AVX2)
-	 vRealF v1,v2;
 	 permute(v1,in,0); // avx 256; octo-double
 	 v1=v1+in;
 	 permute(v2,v1,1); 
 	 v1=v1+v2;
 	 permute(v2,v1,2); 
 	 v1=v1+v2;
-	 return v1.v[0];
 #endif
 #ifdef AVX512
-            return _mm512_reduce_add_ps(in.v);
+	 permute(v1,in,0); // avx 256; octo-double
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 permute(v2,v1,2); 
+	 v1=v1+v2;
+	 permute(v2,v1,3); 
+	 v1=v1+v2;
 #endif
 #ifdef QPX
 #endif
+	 conv.v=v1.v;
+	 return conv.f[0];
        }

        // *=,+=,-= operators
--- a/scripts/configure-all
+++ b/scripts/configure-all
@@ -1,6 +1,6 @@
 #!/bin/bash

-DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse"
+DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp"

 for D in $DIRS
 do
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -25,6 +25,12 @@ g++-avx)
 icpc-avx)
  CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
+icpc-avx-openmp-mpi)
+CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
+;;
+icpc-avx-openmp)
+CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
+;;
 icpc-avx2)
  CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
--- a/tests/Grid_main.cc
+++ b/tests/Grid_main.cc
@@ -1,8 +1,9 @@
 #include "Grid.h"

 //DEBUG
+#ifdef SSE4
 #include "simd/Grid_vector_types.h"
-
+#endif

 using namespace std;
 using namespace Grid;
--- a/tests/Grid_nersc_io.cc
+++ b/tests/Grid_nersc_io.cc
@@ -13,7 +13,7 @@ int main (int argc, char ** argv)

  std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
-  std::vector<int> latt_size  ({16,16,16,32});
+  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> clatt_size  ({4,4,4,8});
  int orthodir=3;
  int orthosz =latt_size[orthodir];
@@ -44,13 +44,15 @@ int main (int argc, char ** argv)
  // (1+2+3)=6 = N(N-1)/2 terms
  LatticeComplex Plaq(&Fine);
  LatticeComplex cPlaq(&Coarse);
+
  Plaq = zero;
+#if 1
  for(int mu=1;mu<Nd;mu++){
    for(int nu=0;nu<mu;nu++){
      Plaq = Plaq + trace(U[mu]*Cshift(U[nu],mu,1)*adj(Cshift(U[mu],nu,1))*adj(U[nu]));
    }
  }
-  
+#endif
  double vol = Fine.gSites();
  Complex PlaqScale(1.0/vol/6.0/3.0);
  std::cout <<"PlaqScale" << PlaqScale<<std::endl;
--- a/tests/InvSqrt.gnu
+++ b/tests/InvSqrt.gnu
--- a/tests/Sqrt.gnu
+++ b/tests/Sqrt.gnu
@@ -0,0 +1,2 @@
+f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866));
+f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422));