From b562b50196a699587a3477e37fcedc4e3f86861c Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 19 May 2015 21:29:07 +0100
Subject: [PATCH 01/27] Reworking to keep intel compiler happy

---
 benchmarks/Makefile.am           |  2 +-
 lib/algorithms/LinearOperator.h  |  2 +-
 lib/algorithms/approx/bigfloat.h |  2 +-
 lib/simd/Grid_vComplexD.h        | 18 ++++++++++++++----
 lib/simd/Grid_vComplexF.h        | 18 +++++++++++++-----
 lib/simd/Grid_vRealD.h           | 18 +++++++++++++-----
 lib/simd/Grid_vRealF.h           | 20 +++++++++++++++-----
 tests/Grid_nersc_io.cc           |  6 ++++--
 8 files changed, 62 insertions(+), 24 deletions(-)
diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am
index 77ce7663..ae09a6db 100644
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -16,7 +16,7 @@ Grid_wilson_cg_unprec_LDADD = -lGrid
 Grid_comms_SOURCES = Grid_comms.cc
 Grid_comms_LDADD = -lGrid
 
-Grid_su3_SOURCES = Grid_su3.cc
+Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc
 Grid_su3_LDADD = -lGrid
 
 Grid_memory_bandwidth_SOURCES = Grid_memory_bandwidth.cc
diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index bdb6c387..167213ec 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -25,7 +25,7 @@ namespace Grid {
   /////////////////////////////////////////////////////////////////////////////////////////////
     template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
     public:
-      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2);
+      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2)=0;
       void AdjOp(const Field &in, Field &out) {
 	Op(in,out);
       };
diff --git a/lib/algorithms/approx/bigfloat.h b/lib/algorithms/approx/bigfloat.h
index aee71d03..a749fa81 100755
--- a/lib/algorithms/approx/bigfloat.h
+++ b/lib/algorithms/approx/bigfloat.h
@@ -31,7 +31,7 @@ public:
   bigfloat(const double d) { mpf_init_set_d(x, d); }  
   bigfloat(const char *str) { mpf_init_set_str(x, (char*)str, 10); }
   ~bigfloat(void) { mpf_clear(x); }
-  operator const double (void) const { return (double)mpf_get_d(x); }
+  operator double (void) const { return (double)mpf_get_d(x); }
   static void setDefaultPrecision(unsigned long dprec) {
     unsigned long bprec =  (unsigned long)(3.321928094 * (double)dprec);
     mpf_set_default_prec(bprec);
diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h
index 2b03c825..116c2866 100644
--- a/lib/simd/Grid_vComplexD.h
+++ b/lib/simd/Grid_vComplexD.h
@@ -345,20 +345,30 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 // REDUCE FIXME must be a cleaner implementation
        friend inline ComplexD Reduce(const vComplexD & in)
        { 
+	 vComplexD v1,v2;
+	 union { 
+	   zvec v;
+	   double f[sizeof(zvec)/sizeof(double)];
+	 } conv;
+	   
 #ifdef SSE4
-	 return ComplexD(in.v[0],in.v[1]);
+	 v1=in;
 #endif
 #if defined(AVX1) || defined (AVX2)
-	 vComplexD v1;
 	 permute(v1,in,0); // sse 128; paired complex single
 	 v1=v1+in;
-	 return ComplexD(v1.v[0],v1.v[1]);
 #endif
 #ifdef AVX512
-	 return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
+	 permute(v1,in,0); // sse 128; paired complex single
+	 v1=v1+in;
+	 permute(v2,v1,1); // avx 256; quad complex single
+	 v1=v1+v2;
 #endif 
 #ifdef QPX
+#error
 #endif
+	 conv.v = v1.v;
+	 return ComplexD(conv.f[0],conv.f[1]);
         }
         
         // Unary negation
diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h
index 713bbdd8..cf74ff03 100644
--- a/lib/simd/Grid_vComplexF.h
+++ b/lib/simd/Grid_vComplexF.h
@@ -234,26 +234,34 @@ namespace Grid {
 	}
 	friend inline ComplexF Reduce(const vComplexF & in)
 	{
+	 vComplexF v1,v2;
+	 union { 
+	   cvec v;
+	   float f[sizeof(cvec)/sizeof(float)];
+	 } conv;
 #ifdef SSE4
-	 vComplexF v1;
 	 permute(v1,in,0); // sse 128; paired complex single
 	 v1=v1+in;
-	 return ComplexF(v1.v[0],v1.v[1]);
 #endif
 #if defined(AVX1) || defined (AVX2)
-	 vComplexF v1,v2;
 	 permute(v1,in,0); // sse 128; paired complex single
 	 v1=v1+in;
 	 permute(v2,v1,1); // avx 256; quad complex single
 	 v1=v1+v2;
-	 return ComplexF(v1.v[0],v1.v[1]);
 #endif
 #ifdef AVX512
-            return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
+	 permute(v1,in,0); // avx512 octo-complex single
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 permute(v2,v1,2); 
+	 v1=v1+v2;
 #endif
 #ifdef QPX
 #error
 #endif
+	 conv.v = v1.v;
+	 return ComplexF(conv.f[0],conv.f[1]);
         }
 
         friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h
index 83979d14..a32e579e 100644
--- a/lib/simd/Grid_vRealD.h
+++ b/lib/simd/Grid_vRealD.h
@@ -210,25 +210,33 @@ namespace Grid {
 
        friend inline RealD Reduce(const vRealD & in)
        {
+	 vRealD v1,v2;
+	 union { 
+	   dvec v;
+	   double f[sizeof(dvec)/sizeof(double)];
+	 } conv;
 #ifdef SSE4
-	 vRealD v1;
 	 permute(v1,in,0); // sse 128; paired real double
 	 v1=v1+in;
-	 return RealD(v1.v[0]);
 #endif
 #if defined(AVX1) || defined (AVX2)
-	 vRealD v1,v2;
 	 permute(v1,in,0); // avx 256; quad double
 	 v1=v1+in;
 	 permute(v2,v1,1); 
 	 v1=v1+v2;
-	 return v1.v[0];
 #endif
 #ifdef AVX512
-            return _mm512_reduce_add_pd(in.v);
+	 permute(v1,in,0); // avx 512; octo-double
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 permute(v2,v1,2); 
+	 v1=v1+v2;
 #endif
 #ifdef QPX
 #endif
+	 conv.v=v1.v;
+	 return conv.f[0];
        }
 
         // *=,+=,-= operators
diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Grid_vRealF.h
index 754c5121..05bfa2f5 100644
--- a/lib/simd/Grid_vRealF.h
+++ b/lib/simd/Grid_vRealF.h
@@ -243,29 +243,39 @@ friend inline void vstore(const vRealF &ret, float *a){
         }
        friend inline RealF Reduce(const vRealF & in)
        {
-#ifdef SSE4
 	 vRealF v1,v2;
+	 union { 
+	   fvec v;
+	   float f[sizeof(fvec)/sizeof(double)];
+	 } conv;
+#ifdef SSE4
 	 permute(v1,in,0); // sse 128; quad single
 	 v1=v1+in;
 	 permute(v2,v1,1); 
 	 v1=v1+v2;
-	 return v1.v[0];
 #endif
 #if defined(AVX1) || defined (AVX2)
-	 vRealF v1,v2;
 	 permute(v1,in,0); // avx 256; octo-double
 	 v1=v1+in;
 	 permute(v2,v1,1); 
 	 v1=v1+v2;
 	 permute(v2,v1,2); 
 	 v1=v1+v2;
-	 return v1.v[0];
 #endif
 #ifdef AVX512
-            return _mm512_reduce_add_ps(in.v);
+	 permute(v1,in,0); // avx 256; octo-double
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 permute(v2,v1,2); 
+	 v1=v1+v2;
+	 permute(v2,v1,3); 
+	 v1=v1+v2;
 #endif
 #ifdef QPX
 #endif
+	 conv.v=v1.v;
+	 return conv.f[0];
         }
 
         // *=,+=,-= operators
diff --git a/tests/Grid_nersc_io.cc b/tests/Grid_nersc_io.cc
index fbef3cb1..6fe587a6 100644
--- a/tests/Grid_nersc_io.cc
+++ b/tests/Grid_nersc_io.cc
@@ -13,7 +13,7 @@ int main (int argc, char ** argv)
 
   std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
-  std::vector<int> latt_size  ({16,16,16,32});
+  std::vector<int> latt_size   = GridDefaultLatt();
   std::vector<int> clatt_size  ({4,4,4,8});
   int orthodir=3;
   int orthosz =latt_size[orthodir];
@@ -44,13 +44,15 @@ int main (int argc, char ** argv)
   // (1+2+3)=6 = N(N-1)/2 terms
   LatticeComplex Plaq(&Fine);
   LatticeComplex cPlaq(&Coarse);
+
   Plaq = zero;
+#if 1
   for(int mu=1;mu<Nd;mu++){
     for(int nu=0;nu<mu;nu++){
       Plaq = Plaq + trace(U[mu]*Cshift(U[nu],mu,1)*adj(Cshift(U[mu],nu,1))*adj(U[nu]));
     }
   }
-  
+#endif
   double vol = Fine.gSites();
   Complex PlaqScale(1.0/vol/6.0/3.0);
   std::cout <<"PlaqScale" << PlaqScale<<std::endl;

From 3f57662cd0db1048392f507d7603d50c9d872767 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 19 May 2015 21:29:40 +0100
Subject: [PATCH 02/27] Build a simple kernel to compare intel compiler and
 clang in simple environment

---
 benchmarks/Grid_su3_test.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 benchmarks/Grid_su3_test.cc

diff --git a/benchmarks/Grid_su3_test.cc b/benchmarks/Grid_su3_test.cc
new file mode 100644
index 00000000..e4d4ec0a
--- /dev/null
+++ b/benchmarks/Grid_su3_test.cc
@@ -0,0 +1,10 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+void su3_test_mult(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y)
+{
+  z=x*y;
+}

From 35055ed5c1249c1391a64217a203fe5b3d410b8b Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:34:33 +0100
Subject: [PATCH 03/27] Unroll pragma abstraction

---
 lib/Grid_threads.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/Grid_threads.h b/lib/Grid_threads.h
index 89a56d80..24581855 100644
--- a/lib/Grid_threads.h
+++ b/lib/Grid_threads.h
@@ -5,6 +5,8 @@
 #define GRID_OMP
 #endif
 
+#define UNROLL  _Pragma("unroll")
+
 #ifdef GRID_OMP
 #include <omp.h>
 #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")

From d8065816662c56de8741569686947e0b9c03fe06 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:35:46 +0100
Subject: [PATCH 04/27] better comms benchmarking

---
 benchmarks/Grid_comms.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/Grid_comms.cc b/benchmarks/Grid_comms.cc
index e844c5d3..59d709a0 100644
--- a/benchmarks/Grid_comms.cc
+++ b/benchmarks/Grid_comms.cc
@@ -23,7 +23,7 @@ int main (int argc, char ** argv)
 
 
 
-  for(int lat=4;lat<=16;lat+=4){
+  for(int lat=4;lat<=32;lat+=2){
     for(int Ls=1;Ls<=16;Ls*=2){
 
       std::vector<int> latt_size  ({lat,lat,lat,lat});
@@ -94,8 +94,7 @@ int main (int argc, char ** argv)
   std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
 
-
-  for(int lat=4;lat<=16;lat+=4){
+  for(int lat=4;lat<=32;lat+=2){
     for(int Ls=1;Ls<=16;Ls*=2){
 
       std::vector<int> latt_size  ({lat,lat,lat,lat});

From 57a01e6bbb69f3b2d98fb0e060d58987d283f94a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:36:15 +0100
Subject: [PATCH 05/27] Didn't like a print statement

---
 lib/algorithms/iterative/ConjugateGradient.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h
index f7f8559e..85b196e9 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -15,7 +15,6 @@ public:
     Integer MaxIterations;
 
     ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
-      std::cout << Tolerance<<std::endl;
     };
 
     void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};

From c96af471eedb11379b1700fd2667aa462526191a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:36:47 +0100
Subject: [PATCH 06/27] useful to dump assembler

---
 benchmarks/Grid_su3_test.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmarks/Grid_su3_test.cc b/benchmarks/Grid_su3_test.cc
index e4d4ec0a..8a7f6246 100644
--- a/benchmarks/Grid_su3_test.cc
+++ b/benchmarks/Grid_su3_test.cc
@@ -4,7 +4,8 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-void su3_test_mult(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y)
+
+void su3_test_mult_routine(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y)
 {
-  z=x*y;
+  mult(z,x,y);
 }

From 3e1d1aff185c7917670893361dfb298e84b7581b Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:37:20 +0100
Subject: [PATCH 07/27] Minor change

---
 benchmarks/Grid_wilson_cg_unprec.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Grid_wilson_cg_unprec.cc b/benchmarks/Grid_wilson_cg_unprec.cc
index 2a91864b..93023534 100644
--- a/benchmarks/Grid_wilson_cg_unprec.cc
+++ b/benchmarks/Grid_wilson_cg_unprec.cc
@@ -30,7 +30,7 @@ int main (int argc, char ** argv)
   GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
 
   LatticeFermion src(&Grid); random(pRNG,src);
-  std::cout << "src norm" << norm2(src)<<std::endl;
+  RealD nrm = norm2(src);
   LatticeFermion result(&Grid); result=zero;
   LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 

From f1fb92fd010d2560e9b869223860303117c8ac4d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:37:46 +0100
Subject: [PATCH 08/27] adding two routines containing only a single operation
 so I can easily see the assembly dump

---
 benchmarks/Makefile.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am
index ae09a6db..fc513e86 100644
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -16,7 +16,7 @@ Grid_wilson_cg_unprec_LDADD = -lGrid
 Grid_comms_SOURCES = Grid_comms.cc
 Grid_comms_LDADD = -lGrid
 
-Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc
+Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc Grid_su3_expr.cc
 Grid_su3_LDADD = -lGrid
 
 Grid_memory_bandwidth_SOURCES = Grid_memory_bandwidth.cc

From 874b2eb32de3334528e856d59cb6399cb849c7ae Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:39:00 +0100
Subject: [PATCH 09/27] Compile time select if we do the streaming store copy.
 Relies on Clang++ eliminating object copies, and other compliers do not
 necessarily cope.

---
 lib/lattice/Grid_lattice_arith.h | 61 ++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h
index f1e566a2..ff966578 100644
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -12,10 +12,13 @@ namespace Grid {
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
-      //      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#else
+      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -24,9 +27,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else
+      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -35,9 +42,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
     }
   }
   template<class obj1,class obj2,class obj3> strong_inline
@@ -45,9 +56,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -81,9 +96,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,ret);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs._odata[ss],&rhs);
       vstream(ret._odata[ss],tmp);
+#else 
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#endif
     }
   }
   template<class obj1,class obj2,class obj3> strong_inline
@@ -91,9 +110,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,ret);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs._odata[ss],&rhs);
       vstream(ret._odata[ss],tmp);
+#else 
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#endif
     }
   }
 
@@ -105,9 +128,13 @@ PARALLEL_FOR_LOOP
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       mult(&tmp,&lhs,&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else 
+      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -116,9 +143,13 @@ PARALLEL_FOR_LOOP
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       mac(&tmp,&lhs,&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else 
+      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -127,9 +158,13 @@ PARALLEL_FOR_LOOP
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs,&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else 
+      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
     }
   }
   template<class obj1,class obj2,class obj3> strong_inline
@@ -137,9 +172,13 @@ PARALLEL_FOR_LOOP
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs,&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else 
+      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -148,8 +187,12 @@ PARALLEL_FOR_LOOP
     conformable(x,y);
 #pragma omp parallel for
     for(int ss=0;ss<x._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+y._odata[ss];
       vstream(ret._odata[ss],tmp);
+#else
+      ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
+#endif
     }
   }
   template<class sobj,class vobj> strong_inline
@@ -157,29 +200,25 @@ PARALLEL_FOR_LOOP
     conformable(x,y);
 #pragma omp parallel for
     for(int ss=0;ss<x._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+b*y._odata[ss];
       vstream(ret._odata[ss],tmp);
+#else
+      ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
+#endif
     }
   }
 
   template<class sobj,class vobj> strong_inline
   RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
     conformable(x,y);
-#pragma omp parallel for
-    for(int ss=0;ss<x._grid->oSites();ss++){
-      vobj tmp = a*x._odata[ss]+y._odata[ss];
-      vstream(ret._odata[ss],tmp);
-    }
+    axpy(ret,a,x,y);
     return norm2(ret);
   }
   template<class sobj,class vobj> strong_inline
   RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
     conformable(x,y);
-#pragma omp parallel for
-    for(int ss=0;ss<x._grid->oSites();ss++){
-      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
-      vstream(ret._odata[ss],tmp);
-    }
+    axpby(ret,a,b,x,y);
     return norm2(ret); // FIXME implement parallel norm in ss loop
   }
 

From d8061afe2456dc8c46499bd803e1726e6391fc51 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:47:05 +0100
Subject: [PATCH 10/27] Streaming store option ifdef

---
 lib/lattice/Grid_lattice_base.h | 41 ++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h
index 0016bafa..41f349b7 100644
--- a/lib/lattice/Grid_lattice_base.h
+++ b/lib/lattice/Grid_lattice_base.h
@@ -1,6 +1,8 @@
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H
 
+#define STREAMING_STORES
+
 namespace Grid {
 
 // TODO: 
@@ -66,8 +68,12 @@ public:
   {
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
-      vobj tmp= eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
     }
     return *this;
   }
@@ -75,8 +81,12 @@ PARALLEL_FOR_LOOP
   {
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
-      vobj tmp= eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
     }
     return *this;
   }
@@ -84,8 +94,12 @@ PARALLEL_FOR_LOOP
   {
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
-      vobj tmp= eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss] = eval(ss,expr);
+#endif
     }
     return *this;
   }
@@ -97,7 +111,12 @@ PARALLEL_FOR_LOOP
     _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss] = eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=_eval(ss,expr);
+#endif
     }
   };
   template<class Op,class T1, class T2>
@@ -107,7 +126,12 @@ PARALLEL_FOR_LOOP
     _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss] = eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(tmp,ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
     }
   };
   template<class Op,class T1, class T2, class T3>
@@ -117,7 +141,12 @@ PARALLEL_FOR_LOOP
     _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss] = eval(ss,expr);
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
     }
   };
 

From ae58a9ada21a6fb8e05bd254bd2c99861ae95ef9 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 23 May 2015 09:30:28 +0100
Subject: [PATCH 11/27] Iterator required

---
 lib/Grid_init.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Grid_init.cc b/lib/Grid_init.cc
index 6af0be84..572672eb 100644
--- a/lib/Grid_init.cc
+++ b/lib/Grid_init.cc
@@ -11,6 +11,7 @@
 #include <sys/time.h>
 #include <signal.h>
 #include <iostream>
+#include <iterator>
 #include <Grid.h>
 #include <algorithm>
 

From 764732944f1501368d07f92fafd5f65901764a47 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 23 May 2015 09:31:15 +0100
Subject: [PATCH 12/27] Cosmetic

---
 lib/math/Grid_math_arith_mul.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/math/Grid_math_arith_mul.h b/lib/math/Grid_math_arith_mul.h
index 04b05bb4..fcdd60ee 100644
--- a/lib/math/Grid_math_arith_mul.h
+++ b/lib/math/Grid_math_arith_mul.h
@@ -16,19 +16,18 @@ strong_inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> *
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
   for(int c1=0;c1<N;c1++){
-    int c3=0;
     for(int c2=0;c2<N;c2++){
-      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
     }
   }
-  for(int c3=1;c3<N;c3++){
-    for(int c1=0;c1<N;c1++){
+  for(int c1=0;c1<N;c1++){
+    for(int c3=1;c3<N;c3++){
       for(int c2=0;c2<N;c2++){
 	mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
       }
     }
   }
-    return;
+  return;
 }
 
 template<class rrtype,class ltype,class rtype,int N>

From a2928321b6c9fa23686208fb18225916980698bc Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 23 May 2015 09:32:37 +0100
Subject: [PATCH 13/27] Better pragma use

---
 lib/lattice/Grid_lattice_arith.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h
index ff966578..a6f59b56 100644
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -185,7 +185,7 @@ PARALLEL_FOR_LOOP
   template<class sobj,class vobj> strong_inline
   void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
     conformable(x,y);
-#pragma omp parallel for
+PARALLEL_FOR_LOOP
     for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+y._odata[ss];
@@ -198,7 +198,7 @@ PARALLEL_FOR_LOOP
   template<class sobj,class vobj> strong_inline
   void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
     conformable(x,y);
-#pragma omp parallel for
+PARALLEL_FOR_LOOP
     for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+b*y._odata[ss];

From d07a5c084d803450f2379721854b4e61e1faf79d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 23 May 2015 09:33:42 +0100
Subject: [PATCH 14/27] Rely on default constructors

---
 lib/math/Grid_math_tensors.h | 120 +++++++++++++++++++----------------
 1 file changed, 66 insertions(+), 54 deletions(-)

diff --git a/lib/math/Grid_math_tensors.h b/lib/math/Grid_math_tensors.h
index a0424576..32ca78e5 100644
--- a/lib/math/Grid_math_tensors.h
+++ b/lib/math/Grid_math_tensors.h
@@ -34,62 +34,64 @@ public:
 
   // Scalar no action
   //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
-  iScalar()=default;
+  iScalar() = default;
+  /*
+  iScalar(const iScalar<vtype> &copyme)=default;
+  iScalar(iScalar<vtype> &&copyme)=default;
+  iScalar<vtype> & operator= (const iScalar<vtype> &copyme) = default;
+  iScalar<vtype> & operator= (iScalar<vtype> &&copyme) = default;
+  */
   iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
   iScalar(const Zero &z){ *this = zero; };
 
-    iScalar<vtype> & operator= (const Zero &hero){
-      zeroit(*this);
-      return *this;
-    }
-    friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
-      vstream(out._internal,in._internal);
-    }
+  iScalar<vtype> & operator= (const Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
+    vstream(out._internal,in._internal);
+  }
+  friend strong_inline void zeroit(iScalar<vtype> &that){
+    zeroit(that._internal);
+  }
+  friend strong_inline void prefetch(iScalar<vtype> &that){
+    prefetch(that._internal);
+  }
+  friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
+    permute(out._internal,in._internal,permutetype);
+  }
 
-
-    friend strong_inline void zeroit(iScalar<vtype> &that){
-        zeroit(that._internal);
-    }
-    friend strong_inline void prefetch(iScalar<vtype> &that){
-      prefetch(that._internal);
-    }
-    friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
-      permute(out._internal,in._internal,permutetype);
-    }
-
-    // Unary negation
-    friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
-        iScalar<vtype> ret;
-        ret._internal= -r._internal;
-        return ret;
-    }
-    // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-    strong_inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
-        *this = (*this)*r;
-        return *this;
-    }
-    strong_inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
-        *this = (*this)-r;
-        return *this;
-    }
-    strong_inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
-        *this = (*this)+r;
-        return *this;
-    }
-    
-    strong_inline vtype & operator ()(void) {
-      return _internal;
-    }
-
-    strong_inline const vtype & operator ()(void) const {
-      return _internal;
-    }
-
-    operator ComplexD () const { return(TensorRemove(_internal)); };
-    operator RealD () const { return(real(TensorRemove(_internal))); }
-
-    // convert from a something to a scalar
-    template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar<vtype>
+  // Unary negation
+  friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
+    iScalar<vtype> ret;
+    ret._internal= -r._internal;
+    return ret;
+  }
+  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
+  strong_inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  strong_inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  strong_inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+  strong_inline vtype & operator ()(void) {
+    return _internal;
+  }
+  strong_inline const vtype & operator ()(void) const {
+    return _internal;
+  }
+  
+  operator ComplexD () const { return(TensorRemove(_internal)); };
+  operator RealD () const { return(real(TensorRemove(_internal))); }
+  
+  // convert from a something to a scalar
+  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar<vtype>
     { 
       _internal = vtype(arg);
       return *this;
@@ -125,6 +127,12 @@ public:
   enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
   iVector(const Zero &z){ *this = zero; };
   iVector() =default;
+  /*
+  iVector(const iVector<vtype,N> &copyme)=default;
+  iVector(iVector<vtype,N> &&copyme)=default;
+  iVector<vtype,N> & operator= (const iVector<vtype,N> &copyme) = default;
+  iVector<vtype,N> & operator= (iVector<vtype,N> &&copyme) = default;
+  */
 
     iVector<vtype,N> & operator= (const Zero &hero){
         zeroit(*this);
@@ -205,8 +213,12 @@ public:
 
   iMatrix(const Zero &z){ *this = zero; };
   iMatrix() =default;
-
-
+  /*
+  iMatrix(const iMatrix<vtype,N> &copyme)=default;
+  iMatrix(iMatrix<vtype,N> &&copyme)=default;
+  iMatrix<vtype,N> & operator= (const iMatrix<vtype,N> &copyme) = default;
+  iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &&copyme) = default;
+  */
   iMatrix<vtype,N> & operator= (const Zero &hero){
     zeroit(*this);
     return *this;

From 65f2e6b2696394992656f8a6a01c4ffd80507488 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 23 May 2015 09:34:16 +0100
Subject: [PATCH 15/27] Improving even odd sector; lot of work and through
 required cleaning this

---
 lib/qcd/Grid_qcd_wilson_dop.cc | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc
index 9ef4af0c..2020a9ec 100644
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -97,45 +97,44 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
 
 RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out,0);
+  this->Dhop(in,out,0);
   out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
   return norm2(out);
 }
 RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out,1);
+  this->Dhop(in,out,1);
   out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
   return norm2(out);
 }
 
 void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out,0);
+  this->Dhop(in,out,0);
   out = 0.5*out; // FIXME : scale factor in Dhop
 }
 void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out,1);
+  this->Dhop(in,out,1);
+  out = 0.5*out; // FIXME : scale factor in Dhop
 }
 void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
 {
   out = (4.0+mass)*in;
   return ;
 }
-void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
-{
-  out = (1.0/(4.0+mass))*in;
-  return ;
-}
 void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
+{
+  this->Mooee(in,out);
+}
+void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
 {
   out = (1.0/(4.0+mass))*in;
   return ;
 }
 void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  out = (1.0/(4.0+mass))*in;
-  return ;
+  this->MooeeInv(in,out);
 }
 
 void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
@@ -451,8 +450,6 @@ PARALLEL_FOR_LOOP
 }
 
 
-
-
-
-}}
+}
+}
 

From b8fdb65fbf89273f8cfd7e7b8691bf16a8ea89b1 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 23 May 2015 09:34:50 +0100
Subject: [PATCH 16/27] More targets

---
 scripts/configure-all | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/configure-all b/scripts/configure-all
index f42b1960..ad91034d 100755
--- a/scripts/configure-all
+++ b/scripts/configure-all
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse"
+DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp"
 
 for D in $DIRS
 do

From 73ee36c48d14d02c5be8937b84eb6eda1778b4e6 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 23 May 2015 09:35:37 +0100
Subject: [PATCH 17/27] Extra targets

---
 scripts/configure-commands | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/configure-commands b/scripts/configure-commands
index c7c35e1c..538f3661 100755
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -25,6 +25,12 @@ g++-avx)
 icpc-avx)
   CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
   ;;
+icpc-avx-openmp-mpi)
+CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
+;;
+icpc-avx-openmp)
+CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
+;;
 icpc-avx2)
   CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
   ;;

From 280627334076bfafe30a77b30a5d1eab824f98cd Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 23 May 2015 09:36:01 +0100
Subject: [PATCH 18/27] Added

---
 benchmarks/Grid_su3_expr.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 benchmarks/Grid_su3_expr.cc

diff --git a/benchmarks/Grid_su3_expr.cc b/benchmarks/Grid_su3_expr.cc
new file mode 100644
index 00000000..e20bcef4
--- /dev/null
+++ b/benchmarks/Grid_su3_expr.cc
@@ -0,0 +1,11 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+void su3_test_mult_expression(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y)
+{
+  z=x*y;
+}
+

From 17a06af1ff2c79edd662cc436ddf1ba061fe2f31 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 13:42:12 +0100
Subject: [PATCH 19/27] red black fix

---
 lib/Grid_stencil.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Grid_stencil.h b/lib/Grid_stencil.h
index ce987746..fa22361b 100644
--- a/lib/Grid_stencil.h
+++ b/lib/Grid_stencil.h
@@ -120,8 +120,8 @@ namespace Grid {
 	  // Map to always positive shift modulo global full dimension.
 	  int shift = (displacement+fd)%fd;
 	  
-     	  int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
-	  assert (checkerboard== _checkerboard);
+	  //     	  int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
+	  assert (source.checkerboard== _checkerboard);
 
 	  // the permute type
 	  int simd_layout     = _grid->_simd_layout[dimension];

From 9b5633ff4f6b13bfd07cebffadbe7bd70177efea Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 13:42:36 +0100
Subject: [PATCH 20/27] Herm op

---
 lib/algorithms/LinearOperator.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index 167213ec..cb6240ba 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -106,6 +106,10 @@ namespace Grid {
     public:
       virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
     };
+    template<class Field> class HermitianOperatorFunction {
+    public:
+      virtual void operator() (HermitianOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
+    };
 
     // FIXME : To think about
 

From 29f72292ba44ec5408b73dc94da2cedc5a03b3c0 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 13:43:58 +0100
Subject: [PATCH 21/27] Updates now schur red black solver working

---
 lib/algorithms/SparseMatrix.h                | 18 ++---
 lib/algorithms/iterative/ConjugateGradient.h | 49 ++++++++-----
 lib/algorithms/iterative/SchurRedBlack.h     | 73 +++++++++++---------
 3 files changed, 82 insertions(+), 58 deletions(-)

diff --git a/lib/algorithms/SparseMatrix.h b/lib/algorithms/SparseMatrix.h
index 6c54fc92..7bfe959b 100644
--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@@ -10,6 +10,7 @@ namespace Grid {
   /////////////////////////////////////////////////////////////////////////////////////////////
     template<class Field> class SparseMatrixBase {
     public:
+      GridBase *_grid;
       // Full checkerboar operations
       virtual RealD M    (const Field &in, Field &out)=0;
       virtual RealD Mdag (const Field &in, Field &out)=0;
@@ -18,6 +19,7 @@ namespace Grid {
 	ni=M(in,tmp);
 	no=Mdag(tmp,out);
       }
+      SparseMatrixBase(GridBase *grid) : _grid(grid) {};
     };
 
   /////////////////////////////////////////////////////////////////////////////////////////////
@@ -25,7 +27,7 @@ namespace Grid {
   /////////////////////////////////////////////////////////////////////////////////////////////
     template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
     public:
-      
+      GridBase *_cbgrid;
       // half checkerboard operaions
       virtual  void Meooe    (const Field &in, Field &out)=0;
       virtual  void Mooee    (const Field &in, Field &out)=0;
@@ -44,9 +46,7 @@ namespace Grid {
 	Meooe(out,tmp);
 
 	Mooee(in,out);
-	out=out-tmp; // axpy_norm
-	RealD n=norm2(out);
-	return n;
+	return axpy_norm(out,-1.0,tmp,out);
       }
       virtual  RealD MpcDag   (const Field &in, Field &out){
 	Field tmp(in._grid);
@@ -56,15 +56,15 @@ namespace Grid {
 	MeooeDag(out,tmp);
 
 	MooeeDag(in,out);
-	out=out-tmp; // axpy_norm
-	RealD n=norm2(out);
-	return n;
+	return axpy_norm(out,-1.0,tmp,out);
       }
-      virtual void MpcDagMpc(const Field &in, Field &out,RealD ni,RealD no) {
+      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
 	Field tmp(in._grid);
 	ni=Mpc(in,tmp);
-	no=Mpc(tmp,out);
+	no=MpcDag(tmp,out);
+	//	std::cout<<"MpcDagMpc "<<ni<<" "<<no<<std::endl;
       }
+      CheckerBoardedSparseMatrixBase(GridBase *grid,GridBase *cbgrid) : SparseMatrixBase<Field>(grid), _cbgrid(cbgrid) {};
     };
 
 }
diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h
index 85b196e9..d4f89662 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -9,17 +9,21 @@ namespace Grid {
     /////////////////////////////////////////////////////////////
 
   template<class Field> 
-    class ConjugateGradient :  public OperatorFunction<Field> {
+    class ConjugateGradient : public HermitianOperatorFunction<Field> {
 public:                                                
     RealD   Tolerance;
     Integer MaxIterations;
-
+    int verbose;
     ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
+      verbose=0;
     };
 
-    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};
+
     void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){
 
+      psi.checkerboard = src.checkerboard;
+      conformable(psi,src);
+
       RealD cp,c,a,d,b,ssq,qq,b_pred;
       
       Field   p(src);
@@ -37,14 +41,16 @@ public:
       a  =norm2(p);
       cp =a;
       ssq=norm2(src);
-      
-      std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:     r "<<cp   <<std::endl;
-      std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
-      
+
+      if ( verbose ) {
+	std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
+	std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
+      }
+
       RealD rsq =  Tolerance* Tolerance*ssq;
       
       //Check if guess is really REALLY good :)
@@ -60,14 +66,16 @@ public:
 	c=cp;
 	
 	Linop.OpAndNorm(p,mmp,d,qq);
-	  
-	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<std::endl;
+
+	RealD    qqck = norm2(mmp);
+	ComplexD dck  = innerProduct(p,mmp);
+	//	if (verbose) std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<" qqcheck "<< qqck<< " dck "<< dck<<std::endl;
       
 	a      = c/d;
 	b_pred = a*(a*qq-d)/c;
-	
-	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
 
+
+	//	if (verbose) std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
 	cp = axpy_norm(r,-a,mmp,r);
 	b = cp/c;
 	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,b "<<cp<< " "<<b <<std::endl;
@@ -76,7 +84,16 @@ public:
 	psi= a*p+psi;
 	p  = p*b+r;
 	  
-	std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	if (verbose) std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+	// Hack
+	if (0) {
+	  Field   tt(src);
+      	  Linop.Op(psi,mmp);
+	  tt=mmp-src;
+	  RealD resnorm = norm2(tt);
+	  std::cout<<"ConjugateGradient: Iteration " <<k<<" true residual "<<resnorm << " computed " << cp <<std::endl;
+	}
 
 	// Stopping condition
 	if ( cp <= rsq ) { 
diff --git a/lib/algorithms/iterative/SchurRedBlack.h b/lib/algorithms/iterative/SchurRedBlack.h
index e6a7b59e..d3a76d7f 100644
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -1,6 +1,7 @@
 #ifndef GRID_SCHUR_RED_BLACK_H
 #define GRID_SCHUR_RED_BLACK_H
 
+
   /*
    * Red black Schur decomposition
    *
@@ -25,80 +26,86 @@
    *     M psi = eta
    ***********************
    *Odd
-   * i)   (D_oo)^{\dag} D_oo psi_o = (D_oo)^\dag L^{-1}  eta_o
-   *                        eta_o' = D_oo (eta_o - Moe Mee^{-1} eta_e)
+   * i)   (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
+   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
    *Even
    * ii)  Mee psi_e + Meo psi_o = src_e
    *
    *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
    *
    */
-
 namespace Grid {
 
   ///////////////////////////////////////////////////////////////////////////////////////////////////////
   // Take a matrix and form a Red Black solver calling a Herm solver
   // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
   ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackSolve : public OperatorFunction<Field>{
+  template<class Field> class SchurRedBlackSolve {
   private:
-    SparseMatrixBase<Field> & _Matrix;
-    OperatorFunction<Field> & _HermitianRBSolver;
+    HermitianOperatorFunction<Field> & _HermitianRBSolver;
     int CBfactorise;
   public:
 
     /////////////////////////////////////////////////////
     // Wrap the usual normal equations Schur trick
     /////////////////////////////////////////////////////
-  SchurRedBlackSolve(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianRBSolver) 
-    :  _Matrix(Matrix), _HermitianRBSolver(HermitianRBSolver) { 
+  SchurRedBlackSolve(HermitianOperatorFunction<Field> &HermitianRBSolver)  :
+     _HermitianRBSolver(HermitianRBSolver) 
+    { 
       CBfactorise=0;
     };
 
-    void operator() (const Field &in, Field &out){
+    template<class Matrix>
+      void operator() (Matrix & _Matrix,const Field &in, Field &out){
 
       // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME need to make eo grid from full grid.
       // FIXME use CBfactorise to control schur decomp
-      const int Even=0;
-      const int Odd =1;
-
-      // Make a cartesianRedBlack from full Grid
-      GridRedBlackCartesian grid(in._grid);
+      GridBase *grid = _Matrix._cbgrid;
+      GridBase *fgrid= _Matrix._grid;
  
-      Field src_e(&grid);
-      Field src_o(&grid);
-      Field sol_e(&grid);
-      Field sol_o(&grid);
-      Field   tmp(&grid);
-      Field  Mtmp(&grid);
-      
+      Field src_e(grid);
+      Field src_o(grid);
+      Field sol_e(grid);
+      Field sol_o(grid);
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+      Field resid(fgrid);
+
       pickCheckerboard(Even,src_e,in);
       pickCheckerboard(Odd ,src_o,in);
 
       /////////////////////////////////////////////////////
       // src_o = Mdag * (source_o - Moe MeeInv source_e)
       /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     //    MooeeInv(source[Even],tmp,DaggerNo,Even);
-      _Matrix.Meooe   (tmp,Mtmp);      //    Meo     (tmp,src,Odd,DaggerNo);
-      tmp=src_o-Mtmp;                  //    axpy    (tmp,src,source[Odd],-1.0);
-      _Matrix.MpcDag(tmp,src_o);       //    Mprec(tmp,src,Mtmp,DaggerYes);  
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+      _Matrix.MpcDag(tmp,src_o);       assert(src_o.checkerboard ==Odd);       
 
       //////////////////////////////////////////////////////////////
       // Call the red-black solver
       //////////////////////////////////////////////////////////////
-      _HermitianRBSolver(src_o,sol_o); //  CGNE_prec_MdagM(solution[Odd],src);
+      HermitianCheckerBoardedOperator<Matrix,Field> _HermOpEO(_Matrix);
+      std::cout << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
 
       ///////////////////////////////////////////////////
       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
       ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        // Meo(solution[Odd],tmp,Even,DaggerNo);
-      src_e = src_e-tmp;               // axpy(src,tmp,source[Even],-1.0);
-      _Matrix.MooeeInv(src_e,sol_e);   // MooeeInv(src,solution[Even],DaggerNo,Even);
+      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
+      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      
-      setCheckerboard(out,sol_e);
-      setCheckerboard(out,sol_o);
- 
+      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
+
+      // Verify the unprec residual
+      _Matrix.M(out,resid); 
+      resid = resid-in;
+      RealD ns = norm2(in);
+      RealD nr = norm2(resid);
+
+      std::cout << "SchurRedBlack solver true unprec resid "<< sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
     }     
   };
 

From bc947477f328aa6f30566f29733687f449115ce8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 13:44:35 +0100
Subject: [PATCH 22/27] move constants into red black

---
 lib/cartesian/Grid_cartesian_red_black.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/cartesian/Grid_cartesian_red_black.h b/lib/cartesian/Grid_cartesian_red_black.h
index 475b47c2..ff2d3ba8 100644
--- a/lib/cartesian/Grid_cartesian_red_black.h
+++ b/lib/cartesian/Grid_cartesian_red_black.h
@@ -4,6 +4,13 @@
 
 namespace Grid {
 
+    static const int CbRed  =0;
+    static const int CbBlack=1;
+    static const int Even   =CbRed;
+    static const int Odd    =CbBlack;
+    static const int DaggerNo=0;
+    static const int DaggerYes=1;
+
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
@@ -44,6 +51,9 @@ public:
             return source_cb;
         }
     };
+
+    GridRedBlackCartesian(GridBase *base) : GridRedBlackCartesian(base->_fdimensions,base->_simd_layout,base->_processors)  {};
+
     GridRedBlackCartesian(std::vector<int> &dimensions,
 			  std::vector<int> &simd_layout,
 			  std::vector<int> &processor_grid ) : GridBase(processor_grid)

From 3358a77c7a111193b6310ed7b68b2cee48d34775 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 13:45:08 +0100
Subject: [PATCH 23/27] Better checkerboard tracking.

---
 lib/lattice/Grid_lattice_ET.h    | 41 ++++++++++++++++++++++++++
 lib/lattice/Grid_lattice_arith.h | 28 ++++++++++++++++--
 lib/lattice/Grid_lattice_base.h  | 50 ++++++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/lib/lattice/Grid_lattice_ET.h b/lib/lattice/Grid_lattice_ET.h
index b0205f2f..2a7172ea 100644
--- a/lib/lattice/Grid_lattice_ET.h
+++ b/lib/lattice/Grid_lattice_ET.h
@@ -91,6 +91,47 @@ inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<
   GridFromExpression(grid,std::get<2>(expr.second));
 }
 
+
+//////////////////////////////////////////////////////////////////////////
+// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
+//////////////////////////////////////////////////////////////////////////
+template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
+inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
+{
+  if ( (cb==Odd) || (cb==Even) ) {
+    assert(cb==lat.checkerboard);
+  } 
+  cb=lat.checkerboard;
+  //  std::cout<<"Lattice leaf cb "<<cb<<std::endl;
+}
+template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
+inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
+{
+  //  std::cout<<"Non lattice leaf cb"<<cb<<std::endl;
+}
+template <typename Op, typename T1>
+inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
+{
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
+  //  std::cout<<"Unary node cb "<<cb<<std::endl;
+}
+
+template <typename Op, typename T1, typename T2>
+inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) 
+{
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse
+  CBFromExpression(cb,std::get<1>(expr.second));
+  //  std::cout<<"Binary node cb "<<cb<<std::endl;
+}
+template <typename Op, typename T1, typename T2, typename T3>
+inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
+{
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse
+  CBFromExpression(cb,std::get<1>(expr.second));
+  CBFromExpression(cb,std::get<2>(expr.second));
+  //  std::cout<<"Trinary node cb "<<cb<<std::endl;
+}
+
 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h
index a6f59b56..dc5e57e8 100644
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -9,6 +9,8 @@ namespace Grid {
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   template<class obj1,class obj2,class obj3> strong_inline
     void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
@@ -24,6 +26,8 @@ PARALLEL_FOR_LOOP
   
   template<class obj1,class obj2,class obj3> strong_inline
     void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
@@ -39,6 +43,8 @@ PARALLEL_FOR_LOOP
   
   template<class obj1,class obj2,class obj3> strong_inline
     void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
@@ -53,6 +59,8 @@ PARALLEL_FOR_LOOP
   }
   template<class obj1,class obj2,class obj3> strong_inline
     void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
@@ -71,6 +79,7 @@ PARALLEL_FOR_LOOP
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   template<class obj1,class obj2,class obj3> strong_inline
     void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    ret.checkerboard = lhs.checkerboard;
     conformable(lhs,ret);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
@@ -82,7 +91,8 @@ PARALLEL_FOR_LOOP
   
   template<class obj1,class obj2,class obj3> strong_inline
     void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-    conformable(lhs,ret);
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,lhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
       obj1 tmp;
@@ -93,7 +103,8 @@ PARALLEL_FOR_LOOP
   
   template<class obj1,class obj2,class obj3> strong_inline
     void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-    conformable(lhs,ret);
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,lhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
@@ -107,6 +118,7 @@ PARALLEL_FOR_LOOP
   }
   template<class obj1,class obj2,class obj3> strong_inline
     void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    ret.checkerboard = lhs.checkerboard;
     conformable(lhs,ret);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
@@ -125,6 +137,7 @@ PARALLEL_FOR_LOOP
   //////////////////////////////////////////////////////////////////////////////////////////////////////
     template<class obj1,class obj2,class obj3> strong_inline
     void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
@@ -140,6 +153,7 @@ PARALLEL_FOR_LOOP
   
   template<class obj1,class obj2,class obj3> strong_inline
     void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
@@ -155,6 +169,7 @@ PARALLEL_FOR_LOOP
   
   template<class obj1,class obj2,class obj3> strong_inline
     void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
@@ -169,6 +184,7 @@ PARALLEL_FOR_LOOP
   }
   template<class obj1,class obj2,class obj3> strong_inline
     void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
@@ -184,6 +200,8 @@ PARALLEL_FOR_LOOP
   
   template<class sobj,class vobj> strong_inline
   void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
     conformable(x,y);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<x._grid->oSites();ss++){
@@ -197,6 +215,8 @@ PARALLEL_FOR_LOOP
   }
   template<class sobj,class vobj> strong_inline
   void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
     conformable(x,y);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<x._grid->oSites();ss++){
@@ -211,12 +231,16 @@ PARALLEL_FOR_LOOP
 
   template<class sobj,class vobj> strong_inline
   RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
     conformable(x,y);
     axpy(ret,a,x,y);
     return norm2(ret);
   }
   template<class sobj,class vobj> strong_inline
   RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
     conformable(x,y);
     axpby(ret,a,b,x,y);
     return norm2(ret); // FIXME implement parallel norm in ss loop
diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h
index 41f349b7..1d3b1efb 100644
--- a/lib/lattice/Grid_lattice_base.h
+++ b/lib/lattice/Grid_lattice_base.h
@@ -66,6 +66,16 @@ public:
   ////////////////////////////////////////////////////////////////////////////////
   template <typename Op, typename T1>                         strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
   {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
@@ -79,6 +89,16 @@ PARALLEL_FOR_LOOP
   }
   template <typename Op, typename T1,typename T2>             strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
   {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
@@ -92,6 +112,16 @@ PARALLEL_FOR_LOOP
   }
   template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
   {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
@@ -106,8 +136,15 @@ PARALLEL_FOR_LOOP
   //GridFromExpression is tricky to do
   template<class Op,class T1>
     Lattice(const LatticeUnaryExpression<Op,T1> & expr):    _grid(nullptr){
+
     GridFromExpression(_grid,expr);
     assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
     _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
@@ -123,6 +160,12 @@ PARALLEL_FOR_LOOP
   Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr):    _grid(nullptr){
     GridFromExpression(_grid,expr);
     assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
     _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
@@ -138,6 +181,12 @@ PARALLEL_FOR_LOOP
   Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr):    _grid(nullptr){
     GridFromExpression(_grid,expr);
     assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
     _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
@@ -169,6 +218,7 @@ PARALLEL_FOR_LOOP
         return *this;
     }
     template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
+      this->checkerboard = r.checkerboard;
       conformable(*this,r);
       std::cout<<"Lattice operator ="<<std::endl;
 PARALLEL_FOR_LOOP

From 55685b7cf565f2741ea0506beeca7e0e14f475f6 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 13:45:32 +0100
Subject: [PATCH 24/27] Most cosmetic

---
 lib/math/Grid_math_tensors.h | 113 ++++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/lib/math/Grid_math_tensors.h b/lib/math/Grid_math_tensors.h
index 32ca78e5..4e6752e9 100644
--- a/lib/math/Grid_math_tensors.h
+++ b/lib/math/Grid_math_tensors.h
@@ -134,66 +134,65 @@ public:
   iVector<vtype,N> & operator= (iVector<vtype,N> &&copyme) = default;
   */
 
-    iVector<vtype,N> & operator= (const Zero &hero){
-        zeroit(*this);
-        return *this;
+  iVector<vtype,N> & operator= (const Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend strong_inline void zeroit(iVector<vtype,N> &that){
+    for(int i=0;i<N;i++){
+      zeroit(that._internal[i]);
     }
-    friend strong_inline void zeroit(iVector<vtype,N> &that){
-        for(int i=0;i<N;i++){
-            zeroit(that._internal[i]);
-        }
+  }
+  friend strong_inline void prefetch(iVector<vtype,N> &that){
+    for(int i=0;i<N;i++) prefetch(that._internal[i]);
+  }
+  friend strong_inline void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
+    for(int i=0;i<N;i++){
+      vstream(out._internal[i],in._internal[i]);
     }
-    friend strong_inline void prefetch(iVector<vtype,N> &that){
-      for(int i=0;i<N;i++) prefetch(that._internal[i]);
+  }
+  friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
+    for(int i=0;i<N;i++){
+      permute(out._internal[i],in._internal[i],permutetype);
     }
-    friend strong_inline void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
-      for(int i=0;i<N;i++){
-	vstream(out._internal[i],in._internal[i]);
-      }
+  }
+  // Unary negation
+  friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
+    return ret;
+  }
+  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
+  strong_inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  strong_inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  strong_inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+  strong_inline vtype & operator ()(int i) {
+    return _internal[i];
+  }
+  strong_inline const vtype & operator ()(int i) const {
+    return _internal[i];
+  }
+  friend std::ostream& operator<< (std::ostream& stream, const iVector<vtype,N> &o){
+    stream<< "V<"<<N<<">{";
+    for(int i=0;i<N;i++) {
+      stream<<o._internal[i];
+      if (i<N-1)	stream<<",";
     }
-
-    friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
-      for(int i=0;i<N;i++){
-	permute(out._internal[i],in._internal[i],permutetype);
-      }
-    }
-    // Unary negation
-    friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
-        iVector<vtype,N> ret;
-        for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
-        return ret;
-    }
-    // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-    strong_inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
-        *this = (*this)*r;
-        return *this;
-    }
-    strong_inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
-        *this = (*this)-r;
-        return *this;
-    }
-    strong_inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
-        *this = (*this)+r;
-        return *this;
-    }
-    strong_inline vtype & operator ()(int i) {
-      return _internal[i];
-    }
-    strong_inline const vtype & operator ()(int i) const {
-      return _internal[i];
-    }
-    friend std::ostream& operator<< (std::ostream& stream, const iVector<vtype,N> &o){
-      stream<< "V<"<<N<<">{";
-      for(int i=0;i<N;i++) {
-	stream<<o._internal[i];
-	if (i<N-1)	stream<<",";
-      }
-      stream<<"}";
-      return stream;
-    };
-    //    strong_inline vtype && operator ()(int i) {
-    //      return _internal[i];
-    //    }
+    stream<<"}";
+    return stream;
+  };
+  //    strong_inline vtype && operator ()(int i) {
+  //      return _internal[i];
+  //    }
 };
     
 template<class vtype,int N> class iMatrix 
@@ -213,6 +212,8 @@ public:
 
   iMatrix(const Zero &z){ *this = zero; };
   iMatrix() =default;
+  iMatrix(scalar_type s)  { (*this) = s ;};// recurse down and hit the constructor for vector_type
+
   /*
   iMatrix(const iMatrix<vtype,N> &copyme)=default;
   iMatrix(iMatrix<vtype,N> &&copyme)=default;

From 1a9841a0f1991b0e4d58c0361936bea74bbc6c3f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 13:46:28 +0100
Subject: [PATCH 25/27] Better EO support letting Schur solver work

---
 lib/qcd/Grid_qcd.h             |   3 -
 lib/qcd/Grid_qcd_wilson_dop.cc | 317 +++++++++++++++++++--------------
 lib/qcd/Grid_qcd_wilson_dop.h  |  36 ++--
 3 files changed, 210 insertions(+), 146 deletions(-)

diff --git a/lib/qcd/Grid_qcd.h b/lib/qcd/Grid_qcd.h
index 0bf68632..959f3529 100644
--- a/lib/qcd/Grid_qcd.h
+++ b/lib/qcd/Grid_qcd.h
@@ -10,9 +10,6 @@ namespace QCD {
     static const int Nhs=2; // half spinor
     static const int Nds=8; // double stored gauge field
 
-    static const int CbRed  =0;
-    static const int CbBlack=1;
-    
     //////////////////////////////////////////////////////////////////////////////
     // QCD iMatrix types
     // Index conventions:                            Lorentz x Spin x Colour
diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc
index 2020a9ec..318e18df 100644
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -4,8 +4,8 @@
 namespace Grid {
 namespace QCD {
 
-const std::vector<int> WilsonMatrix::directions   ({0,1,2,3, 0, 1, 2, 3,0});
-const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1,0});
+const std::vector<int> WilsonMatrix::directions   ({0,1,2,3, 0, 1, 2, 3});
+const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1});
 
   // Should be in header?
 const int WilsonMatrix::Xp = 0;
@@ -16,7 +16,6 @@ const int WilsonMatrix::Xm = 4;
 const int WilsonMatrix::Ym = 5;
 const int WilsonMatrix::Zm = 6;
 const int WilsonMatrix::Tm = 7;
-  //const int WilsonMatrix::X0 = 8;
 
   class WilsonCompressor {
   public:
@@ -72,20 +71,27 @@ const int WilsonMatrix::Tm = 7;
     }
   };
 
-  WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,double _mass)
-  : Stencil(Umu._grid,npoint,0,directions,displacements),
+  WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid, double _mass)  : 
+    CheckerBoardedSparseMatrixBase<LatticeFermion>(&Fgrid,&Hgrid),
+    Stencil    (  _grid,npoint,Even,directions,displacements),
+    StencilEven(_cbgrid,npoint,Even,directions,displacements), // source is Even
+    StencilOdd (_cbgrid,npoint,Odd ,directions,displacements), // source is Odd
     mass(_mass),
-    Umu(_Umu._grid)
-{
-  // Allocate the required comms buffer
-  grid = _Umu._grid;
-  comm_buf.resize(Stencil._unified_buffer_size);
-  DoubleStore(Umu,_Umu);
-}
+    Umu(_grid),
+    UmuEven(_cbgrid),
+    UmuOdd (_cbgrid)
+  {
+    // Allocate the required comms buffer
+    comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
+    
+    DoubleStore(Umu,_Umu);
+    pickCheckerboard(Even,UmuEven,Umu);
+    pickCheckerboard(Odd ,UmuOdd,Umu);
+  }
       
 void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
 {
-  LatticeColourMatrix U(grid);
+  LatticeColourMatrix U(_grid);
 
   for(int mu=0;mu<Nd;mu++){
     U = peekIndex<LorentzIndex>(Umu,mu);
@@ -97,47 +103,63 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
 
 RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
 {
-  this->Dhop(in,out,0);
+  out.checkerboard=in.checkerboard;
+  Dhop(in,out,DaggerNo);
   out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
   return norm2(out);
 }
 RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
 {
-  this->Dhop(in,out,1);
+  out.checkerboard=in.checkerboard;
+  Dhop(in,out,DaggerYes);
   out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
   return norm2(out);
 }
 
 void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
 {
-  this->Dhop(in,out,0);
-  out = 0.5*out; // FIXME : scale factor in Dhop
+  if ( in.checkerboard == Odd ) {
+    DhopEO(in,out,DaggerNo);
+  } else {
+    DhopOE(in,out,DaggerNo);
+  }
+  out = (-0.5)*out; // FIXME : scale factor in Dhop
 }
 void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  this->Dhop(in,out,1);
-  out = 0.5*out; // FIXME : scale factor in Dhop
+  if ( in.checkerboard == Odd ) {
+    DhopEO(in,out,DaggerYes);
+  } else {
+    DhopOE(in,out,DaggerYes);
+  }
+  out = (-0.5)*out; // FIXME : scale factor in Dhop
 }
 void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
 {
+  out.checkerboard = in.checkerboard;
   out = (4.0+mass)*in;
   return ;
 }
 void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  this->Mooee(in,out);
+  out.checkerboard = in.checkerboard;
+  Mooee(in,out);
 }
 void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
 {
+  out.checkerboard = in.checkerboard;
   out = (1.0/(4.0+mass))*in;
   return ;
 }
 void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  this->MooeeInv(in,out);
+  out.checkerboard = in.checkerboard;
+  MooeeInv(in,out);
 }
 
-void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
+void WilsonMatrix::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
+			    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+			    int ss,const LatticeFermion &in, LatticeFermion &out)
 {
     vHalfSpinColourVector  tmp;    
     vHalfSpinColourVector  chi;    
@@ -145,79 +167,78 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
     vHalfSpinColourVector Uchi;
     int offset,local,perm, ptype;
 
-    //    int ss = Stencil._LebesgueReorder[sss];
-    int ssu= ss;
+    //#define VERBOSE( A)  if ( ss<10 ) { std::cout << "site " <<ss << " " #A " neigh " << offset << " perm "<< perm <<std::endl;}    
 
     // Xp
-    offset = Stencil._offsets [Xp][ss];
-    local  = Stencil._is_local[Xp][ss];
-    perm   = Stencil._permute[Xp][ss];
+    offset = st._offsets [Xp][ss];
+    local  = st._is_local[Xp][ss];
+    perm   = st._permute[Xp][ss];
 
-    ptype  = Stencil._permute_type[Xp];
+    ptype  = st._permute_type[Xp];
     if ( local && perm ) {
       spProjXp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjXp(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
+    mult(&Uchi(),&U._odata[ss](Xp),&chi());
     spReconXp(result,Uchi);
 
     // Yp
-    offset = Stencil._offsets [Yp][ss];
-    local  = Stencil._is_local[Yp][ss];
-    perm   = Stencil._permute[Yp][ss];
-    ptype  = Stencil._permute_type[Yp];
+    offset = st._offsets [Yp][ss];
+    local  = st._is_local[Yp][ss];
+    perm   = st._permute[Yp][ss];
+    ptype  = st._permute_type[Yp];
     if ( local && perm ) {
       spProjYp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjYp(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
+    mult(&Uchi(),&U._odata[ss](Yp),&chi());
     accumReconYp(result,Uchi);
 
     // Zp
-    offset = Stencil._offsets [Zp][ss];
-    local  = Stencil._is_local[Zp][ss];
-    perm   = Stencil._permute[Zp][ss];
-    ptype  = Stencil._permute_type[Zp];
+    offset = st._offsets [Zp][ss];
+    local  = st._is_local[Zp][ss];
+    perm   = st._permute[Zp][ss];
+    ptype  = st._permute_type[Zp];
     if ( local && perm ) {
       spProjZp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjZp(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
+    mult(&Uchi(),&U._odata[ss](Zp),&chi());
     accumReconZp(result,Uchi);
 
     // Tp
-    offset = Stencil._offsets [Tp][ss];
-    local  = Stencil._is_local[Tp][ss];
-    perm   = Stencil._permute[Tp][ss];
-    ptype  = Stencil._permute_type[Tp];
+    offset = st._offsets [Tp][ss];
+    local  = st._is_local[Tp][ss];
+    perm   = st._permute[Tp][ss];
+    ptype  = st._permute_type[Tp];
     if ( local && perm ) {
       spProjTp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjTp(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
+    mult(&Uchi(),&U._odata[ss](Tp),&chi());
     accumReconTp(result,Uchi);
 
     // Xm
-    offset = Stencil._offsets [Xm][ss];
-    local  = Stencil._is_local[Xm][ss];
-    perm   = Stencil._permute[Xm][ss];
-    ptype  = Stencil._permute_type[Xm];
+    offset = st._offsets [Xm][ss];
+    local  = st._is_local[Xm][ss];
+    perm   = st._permute[Xm][ss];
+    ptype  = st._permute_type[Xm];
 
     if ( local && perm ) 
     {
@@ -226,17 +247,17 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
     } else if ( local ) {
       spProjXm(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
+    mult(&Uchi(),&U._odata[ss](Xm),&chi());
     accumReconXm(result,Uchi);
 
 
     // Ym
-    offset = Stencil._offsets [Ym][ss];
-    local  = Stencil._is_local[Ym][ss];
-    perm   = Stencil._permute[Ym][ss];
-    ptype  = Stencil._permute_type[Ym];
+    offset = st._offsets [Ym][ss];
+    local  = st._is_local[Ym][ss];
+    perm   = st._permute[Ym][ss];
+    ptype  = st._permute_type[Ym];
 
     if ( local && perm ) {
       spProjYm(tmp,in._odata[offset]);
@@ -244,46 +265,49 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
     } else if ( local ) {
       spProjYm(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
+    mult(&Uchi(),&U._odata[ss](Ym),&chi());
     accumReconYm(result,Uchi);
 
     // Zm
-    offset = Stencil._offsets [Zm][ss];
-    local  = Stencil._is_local[Zm][ss];
-    perm   = Stencil._permute[Zm][ss];
-    ptype  = Stencil._permute_type[Zm];
+    offset = st._offsets [Zm][ss];
+    local  = st._is_local[Zm][ss];
+    perm   = st._permute[Zm][ss];
+    ptype  = st._permute_type[Zm];
     if ( local && perm ) {
       spProjZm(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjZm(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
+    mult(&Uchi(),&U._odata[ss](Zm),&chi());
     accumReconZm(result,Uchi);
 
     // Tm
-    offset = Stencil._offsets [Tm][ss];
-    local  = Stencil._is_local[Tm][ss];
-    perm   = Stencil._permute[Tm][ss];
-    ptype  = Stencil._permute_type[Tm];
+    offset = st._offsets [Tm][ss];
+    local  = st._is_local[Tm][ss];
+    perm   = st._permute[Tm][ss];
+    ptype  = st._permute_type[Tm];
     if ( local && perm ) {
       spProjTm(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjTm(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
+    mult(&Uchi(),&U._odata[ss](Tm),&chi());
     accumReconTm(result,Uchi);
 
     vstream(out._odata[ss],result);
 }
-void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
+
+void WilsonMatrix::DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
+			       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+			       int ss,const LatticeFermion &in, LatticeFermion &out)
 {
     vHalfSpinColourVector  tmp;    
     vHalfSpinColourVector  chi;    
@@ -291,78 +315,76 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
     vHalfSpinColourVector Uchi;
     int offset,local,perm, ptype;
 
-    int ssu= ss;
-
     // Xp
-    offset = Stencil._offsets [Xm][ss];
-    local  = Stencil._is_local[Xm][ss];
-    perm   = Stencil._permute[Xm][ss];
+    offset = st._offsets [Xm][ss];
+    local  = st._is_local[Xm][ss];
+    perm   = st._permute[Xm][ss];
 
-    ptype  = Stencil._permute_type[Xm];
+    ptype  = st._permute_type[Xm];
     if ( local && perm ) {
       spProjXp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjXp(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
+    mult(&Uchi(),&U._odata[ss](Xm),&chi());
     spReconXp(result,Uchi);
 
     // Yp
-    offset = Stencil._offsets [Ym][ss];
-    local  = Stencil._is_local[Ym][ss];
-    perm   = Stencil._permute[Ym][ss];
-    ptype  = Stencil._permute_type[Ym];
+    offset = st._offsets [Ym][ss];
+    local  = st._is_local[Ym][ss];
+    perm   = st._permute[Ym][ss];
+    ptype  = st._permute_type[Ym];
     if ( local && perm ) {
       spProjYp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjYp(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
+    mult(&Uchi(),&U._odata[ss](Ym),&chi());
     accumReconYp(result,Uchi);
 
     // Zp
-    offset = Stencil._offsets [Zm][ss];
-    local  = Stencil._is_local[Zm][ss];
-    perm   = Stencil._permute[Zm][ss];
-    ptype  = Stencil._permute_type[Zm];
+    offset = st._offsets [Zm][ss];
+    local  = st._is_local[Zm][ss];
+    perm   = st._permute[Zm][ss];
+    ptype  = st._permute_type[Zm];
     if ( local && perm ) {
       spProjZp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjZp(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
+    mult(&Uchi(),&U._odata[ss](Zm),&chi());
     accumReconZp(result,Uchi);
 
     // Tp
-    offset = Stencil._offsets [Tm][ss];
-    local  = Stencil._is_local[Tm][ss];
-    perm   = Stencil._permute[Tm][ss];
-    ptype  = Stencil._permute_type[Tm];
+    offset = st._offsets [Tm][ss];
+    local  = st._is_local[Tm][ss];
+    perm   = st._permute[Tm][ss];
+    ptype  = st._permute_type[Tm];
     if ( local && perm ) {
       spProjTp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjTp(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
+    mult(&Uchi(),&U._odata[ss](Tm),&chi());
     accumReconTp(result,Uchi);
 
     // Xm
-    offset = Stencil._offsets [Xp][ss];
-    local  = Stencil._is_local[Xp][ss];
-    perm   = Stencil._permute[Xp][ss];
-    ptype  = Stencil._permute_type[Xp];
+    offset = st._offsets [Xp][ss];
+    local  = st._is_local[Xp][ss];
+    perm   = st._permute[Xp][ss];
+    ptype  = st._permute_type[Xp];
 
     if ( local && perm ) 
     {
@@ -371,17 +393,16 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
     } else if ( local ) {
       spProjXm(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
+    mult(&Uchi(),&U._odata[ss](Xp),&chi());
     accumReconXm(result,Uchi);
 
-
     // Ym
-    offset = Stencil._offsets [Yp][ss];
-    local  = Stencil._is_local[Yp][ss];
-    perm   = Stencil._permute[Yp][ss];
-    ptype  = Stencil._permute_type[Yp];
+    offset = st._offsets [Yp][ss];
+    local  = st._is_local[Yp][ss];
+    perm   = st._permute[Yp][ss];
+    ptype  = st._permute_type[Yp];
 
     if ( local && perm ) {
       spProjYm(tmp,in._odata[offset]);
@@ -389,67 +410,97 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
     } else if ( local ) {
       spProjYm(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
+    mult(&Uchi(),&U._odata[ss](Yp),&chi());
     accumReconYm(result,Uchi);
 
     // Zm
-    offset = Stencil._offsets [Zp][ss];
-    local  = Stencil._is_local[Zp][ss];
-    perm   = Stencil._permute[Zp][ss];
-    ptype  = Stencil._permute_type[Zp];
+    offset = st._offsets [Zp][ss];
+    local  = st._is_local[Zp][ss];
+    perm   = st._permute[Zp][ss];
+    ptype  = st._permute_type[Zp];
     if ( local && perm ) {
       spProjZm(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjZm(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
+    mult(&Uchi(),&U._odata[ss](Zp),&chi());
     accumReconZm(result,Uchi);
 
     // Tm
-    offset = Stencil._offsets [Tp][ss];
-    local  = Stencil._is_local[Tp][ss];
-    perm   = Stencil._permute[Tp][ss];
-    ptype  = Stencil._permute_type[Tp];
+    offset = st._offsets [Tp][ss];
+    local  = st._is_local[Tp][ss];
+    perm   = st._permute[Tp][ss];
+    ptype  = st._permute_type[Tp];
     if ( local && perm ) {
       spProjTm(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
       spProjTm(chi,in._odata[offset]);
     } else { 
-      chi=comm_buf[offset];
+      chi=buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
+    mult(&Uchi(),&U._odata[ss](Tp),&chi());
     accumReconTm(result,Uchi);
 
     vstream(out._odata[ss],result);
 }
 
-void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+void WilsonMatrix::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U,
+				const LatticeFermion &in, LatticeFermion &out,int dag)
 {
-  assert((dag==0) ||(dag==1));
-
+  assert((dag==DaggerNo) ||(dag==DaggerYes));
   WilsonCompressor compressor(dag);
-  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 
-  if ( dag ) {
+  st.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+
+  if ( dag == DaggerYes ) {
 PARALLEL_FOR_LOOP
-    for(int sss=0;sss<grid->oSites();sss++){
-      DhopSiteDag(sss,in,out);
+    for(int sss=0;sss<in._grid->oSites();sss++){
+      DhopSiteDag(st,U,comm_buf,sss,in,out);
     }
   } else {
 PARALLEL_FOR_LOOP
-    for(int sss=0;sss<grid->oSites();sss++){
-      DhopSite(sss,in,out);
+    for(int sss=0;sss<in._grid->oSites();sss++){
+      DhopSite(st,U,comm_buf,sss,in,out);
     }
   }
 }
+void WilsonMatrix::DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag)
+{
+  conformable(in._grid,_cbgrid);    // verifies half grid
+  conformable(in._grid,out._grid); // drops the cb check
 
+  assert(in.checkerboard==Even);
+  out.checkerboard = Odd;
 
+  DhopInternal(StencilEven,UmuOdd,in,out,dag);
 }
+void WilsonMatrix::DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag)
+{
+  conformable(in._grid,_cbgrid);    // verifies half grid
+  conformable(in._grid,out._grid); // drops the cb check
+
+  assert(in.checkerboard==Odd);
+  out.checkerboard = Even;
+
+  DhopInternal(StencilOdd,UmuEven,in,out,dag);
+}
+void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+{
+  conformable(in._grid,_grid); // verifies full grid
+  conformable(in._grid,out._grid);
+
+  out.checkerboard = in.checkerboard;
+
+  DhopInternal(Stencil,Umu,in,out,dag);
 }
 
+}}
+
+
+
diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h
index 8d93a926..96b29cd0 100644
--- a/lib/qcd/Grid_qcd_wilson_dop.h
+++ b/lib/qcd/Grid_qcd_wilson_dop.h
@@ -11,14 +11,20 @@ namespace Grid {
       //NB r=1;
     public:
       double                        mass;
-      GridBase                     *grid;
+      //      GridBase                     *    grid; // Inherited
+      //      GridBase                     *  cbgrid;
 
-      // Copy of the gauge field 
-      LatticeDoubledGaugeField             Umu;
+      //Defines the stencils for even and odd
+      CartesianStencil Stencil; 
+      CartesianStencil StencilEven; 
+      CartesianStencil StencilOdd; 
 
-      //Defines the stencil
-      CartesianStencil              Stencil; 
-      static const int npoint=9;
+      // Copy of the gauge field , with even and odd subsets
+      LatticeDoubledGaugeField Umu;
+      LatticeDoubledGaugeField UmuEven;
+      LatticeDoubledGaugeField UmuOdd;
+
+      static const int npoint=8;
       static const std::vector<int> directions   ;
       static const std::vector<int> displacements;
       static const int Xp,Xm,Yp,Ym,Zp,Zm,Tp,Tm;
@@ -27,7 +33,7 @@ namespace Grid {
       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  comm_buf;
 
       // Constructor
-      WilsonMatrix(LatticeGaugeField &Umu,double mass);
+      WilsonMatrix(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid,double _mass);
 
       // DoubleStore
       void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
@@ -45,9 +51,19 @@ namespace Grid {
       virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
 
       // non-hermitian hopping term; half cb or both
-      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void DhopSite   (int ss,const LatticeFermion &in, LatticeFermion &out);
-      void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);
+      void Dhop  (const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField &U,
+			const LatticeFermion &in, LatticeFermion &out,int dag);
+      // These ones will need to be package intelligently. WilsonType base class
+      // for use by DWF etc..
+      void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
+		    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+		    int ss,const LatticeFermion &in, LatticeFermion &out);
+      void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
+		       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+		       int ss,const LatticeFermion &in, LatticeFermion &out);
 
       typedef iScalar<iMatrix<vComplex, Nc> > matrix;
 

From 2ae6214104f945ebcb4d6876e5052bb4dcb24878 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 13:47:12 +0100
Subject: [PATCH 26/27] Schur complement based red-black inversion working

---
 benchmarks/Grid_wilson.cc           |  17 ++-
 benchmarks/Grid_wilson_cg_prec.cc   |  61 +++++++++
 benchmarks/Grid_wilson_cg_schur.cc  |  48 +++++++
 benchmarks/Grid_wilson_cg_unprec.cc |   7 +-
 benchmarks/Grid_wilson_evenodd.cc   | 201 ++++++++++++++++++++++++++++
 lib/simd/Grid_sse4.h                |   1 +
 lib/simd/Grid_vector_types.h        |  32 ++---
 tests/Grid_main.cc                  |   3 +-
 8 files changed, 336 insertions(+), 34 deletions(-)
 create mode 100644 benchmarks/Grid_wilson_cg_prec.cc
 create mode 100644 benchmarks/Grid_wilson_cg_schur.cc
 create mode 100644 benchmarks/Grid_wilson_evenodd.cc

diff --git a/benchmarks/Grid_wilson.cc b/benchmarks/Grid_wilson.cc
index adfb6fd4..32255b3e 100644
--- a/benchmarks/Grid_wilson.cc
+++ b/benchmarks/Grid_wilson.cc
@@ -24,7 +24,8 @@ int main (int argc, char ** argv)
   std::vector<int> latt_size   = GridDefaultLatt();
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
-  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
 
   int threads = GridThread::GetThreads();
   std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
@@ -36,11 +37,11 @@ int main (int argc, char ** argv)
   //  pRNG.SeedFixedIntegers(seeds);
   pRNG.SeedRandomDevice();
 
-  LatticeFermion src(&Grid); random(pRNG,src);
+  LatticeFermion src   (&Grid); random(pRNG,src);
   LatticeFermion result(&Grid); result=zero;
   LatticeFermion    ref(&Grid);    ref=zero;
-  LatticeFermion    err(&Grid);    
   LatticeFermion    tmp(&Grid);    tmp=zero;
+  LatticeFermion    err(&Grid);    tmp=zero;
   LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
   std::vector<LatticeColourMatrix> U(4,&Grid);
 
@@ -51,8 +52,11 @@ int main (int argc, char ** argv)
 
   // Only one non-zero (y)
   Umu=zero;
+  Complex cone(1.0,0.0);
   for(int nn=0;nn<Nd;nn++){
     random(pRNG,U[nn]);
+    if (nn!=0) U[nn]=zero;
+    else U[nn] = cone;
     pokeIndex<LorentzIndex>(Umu,U[nn],nn);
   }
 
@@ -78,7 +82,7 @@ int main (int argc, char ** argv)
   }
 
   RealD mass=0.1;
-  WilsonMatrix Dw(Umu,mass);
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
   
   std::cout << "Calling Dw"<<std::endl;
   int ncall=1000;
@@ -93,7 +97,7 @@ int main (int argc, char ** argv)
   std::cout << "norm result "<< norm2(result)<<std::endl;
   std::cout << "norm ref    "<< norm2(ref)<<std::endl;
   std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-  err = ref -result;
+  err = ref-result; 
   std::cout << "norm diff   "<< norm2(err)<<std::endl;
 
 
@@ -129,9 +133,8 @@ int main (int argc, char ** argv)
   std::cout << "Called DwDag"<<std::endl;
   std::cout << "norm result "<< norm2(result)<<std::endl;
   std::cout << "norm ref    "<< norm2(ref)<<std::endl;
-  err = ref -result;
+  err = ref-result; 
   std::cout << "norm diff   "<< norm2(err)<<std::endl;
 
-
   Grid_finalize();
 }
diff --git a/benchmarks/Grid_wilson_cg_prec.cc b/benchmarks/Grid_wilson_cg_prec.cc
new file mode 100644
index 00000000..e86ec820
--- /dev/null
+++ b/benchmarks/Grid_wilson_cg_prec.cc
@@ -0,0 +1,61 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeFermion src(&Grid); random(pRNG,src);
+  RealD nrm = norm2(src);
+  LatticeFermion result(&Grid); result=zero;
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,&Grid);
+
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass=0.5;
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
+
+  //  HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
+  //  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  //  CG(HermOp,src,result);
+  
+  LatticeFermion    src_o(&RBGrid);
+  LatticeFermion result_o(&RBGrid);
+  pickCheckerboard(Odd,src_o,src);
+  result_o=zero;
+
+  HermitianCheckerBoardedOperator<WilsonMatrix,LatticeFermion> HermOpEO(Dw);
+  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  CG(HermOpEO,src_o,result_o);
+
+  Grid_finalize();
+}
diff --git a/benchmarks/Grid_wilson_cg_schur.cc b/benchmarks/Grid_wilson_cg_schur.cc
new file mode 100644
index 00000000..af630ae1
--- /dev/null
+++ b/benchmarks/Grid_wilson_cg_schur.cc
@@ -0,0 +1,48 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+
+  LatticeFermion    src(&Grid); random(pRNG,src);
+  LatticeFermion result(&Grid); result=zero;
+  LatticeFermion resid(&Grid); 
+
+  RealD mass=0.5;
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
+
+  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  SchurRedBlackSolve<LatticeFermion> SchurSolver(CG);
+
+  SchurSolver(Dw,src,result);
+  
+  Grid_finalize();
+}
diff --git a/benchmarks/Grid_wilson_cg_unprec.cc b/benchmarks/Grid_wilson_cg_unprec.cc
index 93023534..15302aab 100644
--- a/benchmarks/Grid_wilson_cg_unprec.cc
+++ b/benchmarks/Grid_wilson_cg_unprec.cc
@@ -24,7 +24,8 @@ int main (int argc, char ** argv)
   std::vector<int> latt_size   = GridDefaultLatt();
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
-  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
 
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
@@ -46,11 +47,11 @@ int main (int argc, char ** argv)
   }
   
   RealD mass=0.5;
-  WilsonMatrix Dw(Umu,mass);
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
+
   HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
   CG(HermOp,src,result);
 
-  
   Grid_finalize();
 }
diff --git a/benchmarks/Grid_wilson_evenodd.cc b/benchmarks/Grid_wilson_evenodd.cc
new file mode 100644
index 00000000..4bc8c357
--- /dev/null
+++ b/benchmarks/Grid_wilson_evenodd.cc
@@ -0,0 +1,201 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
+
+  int threads = GridThread::GetThreads();
+  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> seeds({1,2,3,4});
+
+  GridParallelRNG          pRNG(&Grid);
+  //  std::vector<int> seeds({1,2,3,4});
+  //  pRNG.SeedFixedIntegers(seeds);
+  pRNG.SeedRandomDevice();
+
+  LatticeFermion src   (&Grid); random(pRNG,src);
+  LatticeFermion phi   (&Grid); random(pRNG,phi);
+  LatticeFermion chi   (&Grid); random(pRNG,chi);
+  LatticeFermion result(&Grid); result=zero;
+  LatticeFermion    ref(&Grid);    ref=zero;
+  LatticeFermion    tmp(&Grid);    tmp=zero;
+  LatticeFermion    err(&Grid);    tmp=zero;
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+  std::vector<LatticeColourMatrix> U(4,&Grid);
+
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
+  // Only one non-zero (y)
+  Umu=zero;
+  for(int nn=0;nn<Nd;nn++){
+    random(pRNG,U[nn]);
+    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
+  }
+
+  RealD mass=0.1;
+
+  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
+
+  LatticeFermion src_e   (&RBGrid);
+  LatticeFermion src_o   (&RBGrid);
+  LatticeFermion r_e   (&RBGrid);
+  LatticeFermion r_o   (&RBGrid);
+  LatticeFermion r_eo  (&Grid);
+
+  const int Even=0;
+  const int Odd=1;
+  std::cout<<"=========================================================="<<std::endl;
+  std::cout<<"= Testing that Deo + Doe = Dunprec "<<std::endl;
+  std::cout<<"=========================================================="<<std::endl;
+
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+
+  Dw.Meooe(src_e,r_o);  std::cout<<"Applied Meo"<<std::endl;
+  Dw.Meooe(src_o,r_e);  std::cout<<"Applied Moe"<<std::endl;
+  Dw.Dhop (src,ref,0);
+
+  setCheckerboard(r_eo,r_o);
+  setCheckerboard(r_eo,r_e);
+
+  ref = (-0.5)*ref;
+  err= ref - r_eo;
+  std::cout << "EO norm diff   "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(r_eo) <<std::endl;
+
+  LatticeComplex cerr(&Grid);
+  cerr = localInnerProduct(err,err);
+
+  std::cout<<"=============================================================="<<std::endl;
+  std::cout<<"= Test Ddagger is the dagger of D by requiring                "<<std::endl;
+  std::cout<<"=  < phi | Deo | chi > * = < chi | Deo^dag| phi>  "<<std::endl;
+  std::cout<<"=============================================================="<<std::endl;
+  
+  LatticeFermion chi_e   (&RBGrid);
+  LatticeFermion chi_o   (&RBGrid);
+
+  LatticeFermion dchi_e  (&RBGrid);
+  LatticeFermion dchi_o  (&RBGrid);
+
+  LatticeFermion phi_e   (&RBGrid);
+  LatticeFermion phi_o   (&RBGrid);
+
+  LatticeFermion dphi_e  (&RBGrid);
+  LatticeFermion dphi_o  (&RBGrid);
+
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+  pickCheckerboard(Even,phi_e,phi);
+  pickCheckerboard(Odd ,phi_o,phi);
+
+  Dw.Meooe(chi_e,dchi_o);
+  Dw.Meooe(chi_o,dchi_e);
+  Dw.MeooeDag(phi_e,dphi_o);
+  Dw.MeooeDag(phi_o,dphi_e);
+
+  ComplexD pDce = innerProduct(phi_e,dchi_e);
+  ComplexD pDco = innerProduct(phi_o,dchi_o);
+  ComplexD cDpe = innerProduct(chi_e,dphi_e);
+  ComplexD cDpo = innerProduct(chi_o,dphi_o);
+
+  std::cout <<"e "<<pDce<<" "<<cDpe <<std::endl;
+  std::cout <<"o "<<pDco<<" "<<cDpo <<std::endl;
+
+  std::cout <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
+  std::cout <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
+
+  std::cout<<"=============================================================="<<std::endl;
+  std::cout<<"= Test MeeInv Mee = 1                                         "<<std::endl;
+  std::cout<<"=============================================================="<<std::endl;
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+
+  Dw.Mooee(chi_e,src_e);
+  Dw.MooeeInv(src_e,phi_e);
+
+  Dw.Mooee(chi_o,src_o);
+  Dw.MooeeInv(src_o,phi_o);
+  
+  setCheckerboard(phi,phi_e);
+  setCheckerboard(phi,phi_o);
+
+  err = phi-chi;
+  std::cout << "norm diff   "<< norm2(err)<< std::endl;
+
+  std::cout<<"=============================================================="<<std::endl;
+  std::cout<<"= Test MeeInvDag MeeDag = 1                                   "<<std::endl;
+  std::cout<<"=============================================================="<<std::endl;
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+
+  Dw.MooeeDag(chi_e,src_e);
+  Dw.MooeeInvDag(src_e,phi_e);
+
+  Dw.MooeeDag(chi_o,src_o);
+  Dw.MooeeInvDag(src_o,phi_o);
+  
+  setCheckerboard(phi,phi_e);
+  setCheckerboard(phi,phi_o);
+
+  err = phi-chi;
+  std::cout << "norm diff   "<< norm2(err)<< std::endl;
+
+  std::cout<<"=============================================================="<<std::endl;
+  std::cout<<"= Test MpcDagMpc is Hermitian              "<<std::endl;
+  std::cout<<"=============================================================="<<std::endl;
+  
+  random(pRNG,phi);
+  random(pRNG,chi);
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+  pickCheckerboard(Even,phi_e,phi);
+  pickCheckerboard(Odd ,phi_o,phi);
+  RealD t1,t2;
+
+  Dw.MpcDagMpc(chi_e,dchi_e,t1,t2);
+  Dw.MpcDagMpc(chi_o,dchi_o,t1,t2);
+
+  Dw.MpcDagMpc(phi_e,dphi_e,t1,t2);
+  Dw.MpcDagMpc(phi_o,dphi_o,t1,t2);
+
+  pDce = innerProduct(phi_e,dchi_e);
+  pDco = innerProduct(phi_o,dchi_o);
+  cDpe = innerProduct(chi_e,dphi_e);
+  cDpo = innerProduct(chi_o,dphi_o);
+
+  std::cout <<"e "<<pDce<<" "<<cDpe <<std::endl;
+  std::cout <<"o "<<pDco<<" "<<cDpo <<std::endl;
+
+  std::cout <<"pDce - conj(cDpo) "<< pDco-conj(cDpo) <<std::endl;
+  std::cout <<"pDco - conj(cDpe) "<< pDce-conj(cDpe) <<std::endl;
+  
+  Grid_finalize();
+}
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index ddc3490b..9a467987 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -283,6 +283,7 @@ namespace Optimization {
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
 namespace Grid {
+
   typedef __m128 SIMD_Ftype;  // Single precision type
   typedef __m128d SIMD_Dtype; // Double precision type
   typedef __m128i SIMD_Itype; // Integer type
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 57480621..36665b70 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -13,31 +13,24 @@
 namespace Grid {
 
   // To take the floating point type of real/complex type
-  template <typename T> 
-    struct RealPart {
-      typedef T type;
-    };
-  template <typename T> 
-    struct RealPart< std::complex<T> >{
+  template <typename T> struct RealPart {
+    typedef T type;
+  };
+  template <typename T> struct RealPart< std::complex<T> >{
     typedef T type;
   };
 
   // type alias used to simplify the syntax of std::enable_if
-  template <typename T> using Invoke =
-    typename T::type;
-  template <typename Condition, typename ReturnType> using EnableIf =
-    Invoke<std::enable_if<Condition::value, ReturnType>>;
-  template <typename Condition, typename ReturnType> using NotEnableIf =
-    Invoke<std::enable_if<!Condition::value, ReturnType>>;
+  template <typename T> using Invoke                                  =  typename T::type;
+  template <typename Condition, typename ReturnType> using EnableIf   =    Invoke<std::enable_if<Condition::value, ReturnType>>;
+  template <typename Condition, typename ReturnType> using NotEnableIf=    Invoke<std::enable_if<!Condition::value, ReturnType>>;
   
 
 
   ////////////////////////////////////////////////////////
   // Check for complexity with type traits
-  template <typename T> 
-    struct is_complex : std::false_type {};
-  template < typename T > 
-    struct is_complex< std::complex<T> >: std::true_type {};
+  template <typename T>     struct is_complex : std::false_type {};
+  template < typename T >   struct is_complex< std::complex<T> >: std::true_type {};
   ////////////////////////////////////////////////////////
   // Define the operation templates functors
   // general forms to allow for vsplat syntax
@@ -86,8 +79,6 @@ namespace Grid {
     Grid_simd(Real a){
       vsplat(*this,Scalar_type(a));
     };
-
-
        
     ///////////////////////////////////////////////
     // mac, mult, sub, add, adj
@@ -126,10 +117,6 @@ namespace Grid {
       friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
     template <  class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 > 
       friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
-
-
-   
- 
    
     ////////////////////////////////////
     // Arithmetic operator overloads +,-,*
@@ -165,7 +152,6 @@ namespace Grid {
 	ret.v = binary<Vector_type>(a.v,b.v, MultSIMD());
 	return ret;
       };
-    
 
     ////////////////////////////////////////////////////////////////////////
     // FIXME:  gonna remove these load/store, get, set, prefetch
diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc
index 10e74099..1a761042 100644
--- a/tests/Grid_main.cc
+++ b/tests/Grid_main.cc
@@ -1,8 +1,9 @@
 #include "Grid.h"
 
 //DEBUG
+#ifdef SSE4
 #include "simd/Grid_vector_types.h"
-
+#endif
 
 using namespace std;
 using namespace Grid;

From 3a6ff2d7b86233ae1da83119c3f97b5cb3177ed7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 25 May 2015 14:43:08 +0100
Subject: [PATCH 27/27] Makefile update

---
 Makefile.in            | 44 +++++++++++++++++++-----------
 aclocal.m4             | 61 ++++++++++++++++++++++--------------------
 benchmarks/Makefile.am | 11 +++++++-
 config.guess           |  2 +-
 config.sub             |  2 +-
 configure              | 13 ++++-----
 6 files changed, 79 insertions(+), 54 deletions(-)

diff --git a/Makefile.in b/Makefile.in
index b6894ef6..b9fba168 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.14.1 from Makefile.am.
+# Makefile.in generated by automake 1.15 from Makefile.am.
 # @configure_input@
 
-# Copyright (C) 1994-2013 Free Software Foundation, Inc.
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
 
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -14,7 +14,17 @@
 
 @SET_MAKE@
 VPATH = @srcdir@
-am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
 am__make_running_with_option = \
   case $${target_option-} in \
       ?) ;; \
@@ -79,14 +89,12 @@ build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
 subdir = .
-DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
-	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
-	$(top_srcdir)/configure $(am__configure_deps) COPYING TODO \
-	compile config.guess config.sub depcomp install-sh missing
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
+	$(am__configure_deps) $(am__DIST_COMMON)
 am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
  configure.lineno config.status.lineno
 mkinstalldirs = $(install_sh) -d
@@ -149,6 +157,9 @@ ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
 DIST_SUBDIRS = $(SUBDIRS)
+am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \
+	INSTALL NEWS README TODO compile config.guess config.sub \
+	depcomp install-sh missing
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -314,7 +325,6 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
 	$(am__cd) $(top_srcdir) && \
 	  $(AUTOMAKE) --gnu Makefile
-.PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
 	  *config.status*) \
@@ -521,15 +531,15 @@ dist-xz: distdir
 	$(am__post_remove_distdir)
 
 dist-tarZ: distdir
-	@echo WARNING: "Support for shar distribution archives is" \
-	               "deprecated." >&2
+	@echo WARNING: "Support for distribution archives compressed with" \
+		       "legacy program 'compress' is deprecated." >&2
 	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
 	$(am__post_remove_distdir)
 
 dist-shar: distdir
-	@echo WARNING: "Support for distribution archives compressed with" \
-		       "legacy program 'compress' is deprecated." >&2
+	@echo WARNING: "Support for shar distribution archives is" \
+	               "deprecated." >&2
 	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
 	$(am__post_remove_distdir)
@@ -565,17 +575,17 @@ distcheck: dist
 	esac
 	chmod -R a-w $(distdir)
 	chmod u+w $(distdir)
-	mkdir $(distdir)/_build $(distdir)/_inst
+	mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
 	chmod a-w $(distdir)
 	test -d $(distdir)/_build || exit 0; \
 	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
 	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
 	  && am__cwd=`pwd` \
-	  && $(am__cd) $(distdir)/_build \
-	  && ../configure \
+	  && $(am__cd) $(distdir)/_build/sub \
+	  && ../../configure \
 	    $(AM_DISTCHECK_CONFIGURE_FLAGS) \
 	    $(DISTCHECK_CONFIGURE_FLAGS) \
-	    --srcdir=.. --prefix="$$dc_install_base" \
+	    --srcdir=../.. --prefix="$$dc_install_base" \
 	  && $(MAKE) $(AM_MAKEFLAGS) \
 	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
 	  && $(MAKE) $(AM_MAKEFLAGS) check \
@@ -749,6 +759,8 @@ uninstall-am:
 	maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
 	pdf-am ps ps-am tags tags-am uninstall uninstall-am
 
+.PRECIOUS: Makefile
+
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/aclocal.m4 b/aclocal.m4
index 389763bf..bf79d078 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.15 -*- Autoconf -*-
 
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])
 
-# Copyright (C) 2002-2013 Free Software Foundation, Inc.
+# Copyright (C) 2002-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
 # generated from the m4 files accompanying Automake X.Y.
 # (This private macro should not be called outside this file.)
 AC_DEFUN([AM_AUTOMAKE_VERSION],
-[am__api_version='1.14'
+[am__api_version='1.15'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.14.1], [],
+m4_if([$1], [1.15], [],
       [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])
 
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.14.1])dnl
+[AM_AUTOMAKE_VERSION([1.15])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -103,15 +103,14 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 # configured tree to be moved without reconfiguration.
 
 AC_DEFUN([AM_AUX_DIR_EXPAND],
-[dnl Rely on autoconf to set up CDPATH properly.
-AC_PREREQ([2.50])dnl
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
 ])
 
 # AM_CONDITIONAL                                            -*- Autoconf -*-
 
-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -142,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])
 
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -333,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
 
 # Generate code to set up dependency tracking.              -*- Autoconf -*-
 
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -409,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 
 # Do all the work for Automake.                             -*- Autoconf -*-
 
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -499,8 +498,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 AC_REQUIRE([AC_PROG_AWK])dnl
 AC_REQUIRE([AC_PROG_MAKE_SET])dnl
 AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@@ -573,7 +572,11 @@ to "yes", and re-run configure.
 END
     AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
   fi
-fi])
+fi
+dnl The trailing newline in this macro's definition is deliberate, for
+dnl backward compatibility and to allow trailing 'dnl'-style comments
+dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
+])
 
 dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
 dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@@ -602,7 +605,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -613,7 +616,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
 # Define $install_sh.
 AC_DEFUN([AM_PROG_INSTALL_SH],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
   case $am_aux_dir in
   *\ * | *\	*)
     install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -623,7 +626,7 @@ if test x"${install_sh}" != xset; then
 fi
 AC_SUBST([install_sh])])
 
-# Copyright (C) 2003-2013 Free Software Foundation, Inc.
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -644,7 +647,7 @@ AC_SUBST([am__leading_dot])])
 
 # Check to see how 'make' treats includes.	            -*- Autoconf -*-
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -694,7 +697,7 @@ rm -f confinc confmf
 
 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
 
-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -733,7 +736,7 @@ fi
 
 # Helper functions for option handling.                     -*- Autoconf -*-
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -764,7 +767,7 @@ AC_DEFUN([_AM_IF_OPTION],
 
 # Check to make sure that the build environment is sane.    -*- Autoconf -*-
 
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -845,7 +848,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])
 
-# Copyright (C) 2009-2013 Free Software Foundation, Inc.
+# Copyright (C) 2009-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -905,7 +908,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -933,7 +936,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])
 
-# Copyright (C) 2006-2013 Free Software Foundation, Inc.
+# Copyright (C) 2006-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -952,7 +955,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 
 # Check how to create a tarball.                            -*- Autoconf -*-
 
-# Copyright (C) 2004-2013 Free Software Foundation, Inc.
+# Copyright (C) 2004-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am
index fc513e86..95ae5eca 100644
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -5,14 +5,23 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Test code
 #
-bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec
+bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec Grid_wilson_evenodd  Grid_wilson_cg_prec Grid_wilson_cg_schur
 
 Grid_wilson_SOURCES = Grid_wilson.cc
 Grid_wilson_LDADD = -lGrid
 
+Grid_wilson_evenodd_SOURCES = Grid_wilson_evenodd.cc
+Grid_wilson_evenodd_LDADD = -lGrid
+
 Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc
 Grid_wilson_cg_unprec_LDADD = -lGrid
 
+Grid_wilson_cg_prec_SOURCES = Grid_wilson_cg_prec.cc
+Grid_wilson_cg_prec_LDADD = -lGrid
+
+Grid_wilson_cg_schur_SOURCES = Grid_wilson_cg_schur.cc
+Grid_wilson_cg_schur_LDADD = -lGrid
+
 Grid_comms_SOURCES = Grid_comms.cc
 Grid_comms_LDADD = -lGrid
 
diff --git a/config.guess b/config.guess
index 5f6aa02d..a12faba2 120000
--- a/config.guess
+++ b/config.guess
@@ -1 +1 @@
-/usr/share/automake-1.14/config.guess
\ No newline at end of file
+/opt/local/share/automake-1.15/config.guess
\ No newline at end of file
diff --git a/config.sub b/config.sub
index 0abfe18c..e3c9b5ca 120000
--- a/config.sub
+++ b/config.sub
@@ -1 +1 @@
-/usr/share/automake-1.14/config.sub
\ No newline at end of file
+/opt/local/share/automake-1.15/config.sub
\ No newline at end of file
diff --git a/configure b/configure
index 8c9e8c59..6e27ab11 100755
--- a/configure
+++ b/configure
@@ -2466,7 +2466,7 @@ test -n "$target_alias" &&
     NONENONEs,x,x, &&
   program_prefix=${target_alias}-
 
-am__api_version='1.14'
+am__api_version='1.15'
 
 # Find a good install program.  We prefer a C program (faster),
 # so one script is as good as another.  But avoid the broken or
@@ -2638,8 +2638,8 @@ test "$program_suffix" != NONE &&
 ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
 program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
 
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
 
 if test x"${MISSING+set}" != xset; then
   case $am_aux_dir in
@@ -2658,7 +2658,7 @@ else
 $as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
 fi
 
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
   case $am_aux_dir in
   *\ * | *\	*)
     install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -2986,8 +2986,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 mkdir_p='$(MKDIR_P)'
 
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 # Always define AMTAR for backward compatibility.  Yes, it's still used
 # in the wild :-(  We should find a proper way to deprecate it ...
 AMTAR='$${TAR-tar}'
@@ -3046,6 +3046,7 @@ END
 fi
 
 
+
 ac_config_headers="$ac_config_headers lib/Grid_config.h"
 
 # Check whether --enable-silent-rules was given.