From cff3bae1557f90077ec67d56c96eb72f817bb273 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 25 Jan 2018 13:46:31 +0100
Subject: [PATCH 01/17] Adding support for general Nc in the benchmark outputs

---
 benchmarks/Benchmark_dwf.cc          | 15 ++++++----
 benchmarks/Benchmark_dwf_sweep.cc    | 11 ++++---
 benchmarks/Benchmark_gparity.cc      |  6 ++--
 benchmarks/Benchmark_wilson.cc       | 13 ++++++--
 benchmarks/Benchmark_wilson_sweep.cc | 45 ++++++++++++++++++++++------
 5 files changed, 66 insertions(+), 24 deletions(-)
diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 73621bbe..1d9de772 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -48,7 +48,6 @@ int main (int argc, char ** argv)
 
 
   int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::vector<int> latt4 = GridDefaultLatt();
   int Ls=16;
@@ -57,6 +56,10 @@ int main (int argc, char ** argv)
       std::stringstream ss(argv[i+1]); ss >> Ls;
     }
 
+  GridLogLayout();
+
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+
 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -187,7 +190,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -226,7 +229,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
 
     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -277,7 +280,7 @@ int main (int argc, char ** argv)
     double t1=usecond();
     FGrid->Barrier();
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -355,7 +358,7 @@ int main (int argc, char ** argv)
       //      sDw.stat.print();
 
       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=(1344.0*volume*ncall)/2;
+      double flops=(single_site_flops*volume*ncall)/2.0;
 
       std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
       std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
@@ -478,7 +481,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
 
     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
     std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
index 37e47062..da8eb044 100644
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -51,6 +51,7 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
+
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
@@ -107,6 +108,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
 
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
@@ -196,7 +198,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   
   if ( ! report ) {
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
     std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
   }
   
@@ -228,7 +230,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
     
     if(!report){
       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=(1344.0*volume*ncall)/2;
+      double flops=(single_site_flops*volume*ncall)/2.0;
       std::cout<< flops/(t1-t0);
     }
   }
@@ -237,6 +239,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
 #define CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -321,7 +324,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     Counter.Report();
   } else { 
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
     std::cout<<"\t"<< flops/(t1-t0);
   }
 
@@ -358,7 +361,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     CounterSdw.Report();
   } else {
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
     std::cout<<"\t"<< flops/(t1-t0);
   }
 }
diff --git a/benchmarks/Benchmark_gparity.cc b/benchmarks/Benchmark_gparity.cc
index f6036aa8..643d241c 100644
--- a/benchmarks/Benchmark_gparity.cc
+++ b/benchmarks/Benchmark_gparity.cc
@@ -107,7 +107,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -134,7 +134,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;
 
     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -174,7 +174,7 @@ int main (int argc, char ** argv)
     FGrid_d->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc
index 55042d6a..d1499a76 100644
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -54,6 +54,9 @@ int main (int argc, char ** argv)
     overlapComms = true;
   }
 
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+
+
   std::vector<int> latt_size   = GridDefaultLatt();
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
@@ -61,10 +64,15 @@ int main (int argc, char ** argv)
   GridRedBlackCartesian     RBGrid(&Grid);
 
   int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  GridLogLayout();
+
   std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
   std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
   std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "Grid number of colours : "<< QCD::Nc <<std::endl;
+  std::cout<<GridLogMessage << "Benchmarking Wilson operator in the fundamental representation" << std::endl;
+
 
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG          pRNG(&Grid);
@@ -134,9 +142,10 @@ int main (int argc, char ** argv)
     Dw.Dhop(src,result,0);
   }
   double t1=usecond();
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;
   
   std::cout<<GridLogMessage << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
index a189ac58..f80cde28 100644
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -62,6 +62,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Number of colours "<< QCD::Nc <<std::endl;
   std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl;
   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
   if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@@ -69,13 +70,15 @@ int main (int argc, char ** argv)
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage << "* OpenMP threads       : "<< GridThread::GetThreads() <<std::endl;
+  std::cout << GridLogMessage << "* MPI tasks            : "<< GridCmdVectorIntToString(mpi_layout) << std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking Wilson operator in the fundamental representation" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs\tWilsonEO/MFLOPs\tWilsonDagEO/MFLOPs" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
 
   int Lmax = 32;
   int dmin = 0;
@@ -97,13 +100,20 @@ int main (int argc, char ** argv)
 
 	  GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
 	  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
-	  LatticeFermion    src(&Grid); random(pRNG,src);
-	  LatticeFermion result(&Grid); result=zero;
+	  LatticeFermion        src(&Grid); random(pRNG,src);
+	  LatticeFermion    src_o(&RBGrid); pickCheckerboard(Odd,src_o,src);
+	  LatticeFermion     result(&Grid); result=zero;
+	  LatticeFermion result_e(&RBGrid); result_e=zero;
 
 	  double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
 
 	  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
-      
+
+    // Full operator      
+	  bench_wilson(src,result,Dw,volume,DaggerNo);
+	  bench_wilson(src,result,Dw,volume,DaggerYes);
+    std::cout << "\t";
+    // EO
 	  bench_wilson(src,result,Dw,volume,DaggerNo);
 	  bench_wilson(src,result,Dw,volume,DaggerYes);
 	  std::cout << std::endl;
@@ -122,9 +132,26 @@ void bench_wilson (
 		   int const           dag )
 {
   int ncall    = 1000;
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
   double t0    = usecond();
   for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
   double t1    = usecond();
-  double flops = 1344 * volume * ncall;
+  double flops = single_site_flops * volume * ncall;
+  std::cout << flops/(t1-t0) << "\t\t";
+}
+
+// Times ncall DhopEO applications and prints flops/(t1-t0) (the table's MFLOPs column), no newline.
+void bench_wilson_eo (LatticeFermion & src, LatticeFermion & result, WilsonFermionR & Dw,
+                      double const volume, int const dag )
+{
+  const int ncall = 1000;
+  const long unsigned int flops_per_site = 8*QCD::Nc*(7+16*QCD::Nc);
+
+  const double t0 = usecond();
+  for(int call=0; call<ncall; ++call) Dw.DhopEO(src,result,dag);
+  const double t1 = usecond();
+
+  // Half of bench_wilson's count; presumably DhopEO updates a single checkerboard -- confirm.
+  const double flops = (flops_per_site * volume * ncall)/2.0;
   std::cout << flops/(t1-t0) << "\t\t";
 }

From 507c4e9efcaa4ae7deb42d6cc7268bf04ff8734b Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Sat, 27 Jan 2018 10:59:55 +0100
Subject: [PATCH 02/17] Correcting a missing semicolon in avx512

---
 lib/simd/Grid_avx512.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 85d27421..cce77a58 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -556,7 +556,7 @@ namespace Optimization {
     v3  = _mm256_add_epi32(v1, v2);
     v1  = _mm256_hadd_epi32(v3, v3);
     v2  = _mm256_hadd_epi32(v1, v1);
-    u1  = _mm256_castsi256_si128(v2)        // upper half
+    u1  = _mm256_castsi256_si128(v2);        // upper half
     u2  = _mm256_extracti128_si256(v2, 1);  // lower half
     ret = _mm_add_epi32(u1, u2);
     return _mm_cvtsi128_si32(ret);

From 655a69259a76b844ab06a2e78fbe8a0441dbf774 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Sun, 28 Jan 2018 17:02:46 +0100
Subject: [PATCH 03/17] Added support for GCC compilation for Skylake AVX512

---
 configure.ac                                       | 3 +++
 lib/qcd/action/fermion/CayleyFermion5Dvec.cc       | 4 ++--
 lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc | 2 +-
 lib/qcd/action/fermion/MobiusEOFAFermionvec.cc     | 2 +-
 lib/simd/Intel512avx.h                             | 2 +-
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/configure.ac b/configure.ac
index 468d9d5f..3a6a2960 100644
--- a/configure.ac
+++ b/configure.ac
@@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in
       AVX512)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+      SKL)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon])
+        SIMD_FLAGS='-march=skylake-avx512';;
       KNC)
         AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
         SIMD_FLAGS='';;
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
index 653e6ab3..2b2eace7 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -469,7 +469,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
-	a2 = a2+sizeof(Simd::scalar_type);
+	a2 = a2+sizeof(typename Simd::scalar_type);
       }}
     {
       int lexa = s1+LLs*site;
@@ -701,7 +701,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
-	a2 = a2+sizeof(Simd::scalar_type);
+	a2 = a2+sizeof(typename Simd::scalar_type);
       }}
     {
       int lexa = s1+LLs*site;
diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
index 81ce448c..c95172a5 100644
--- a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+++ b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
@@ -475,7 +475,7 @@ namespace QCD {
                         }
                         a0 = a0 + incr;
                         a1 = a1 + incr;
-                        a2 = a2 + sizeof(Simd::scalar_type);
+                        a2 = a2 + sizeof(typename Simd::scalar_type);
                     }
                 }
 
diff --git a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
index c4eaf0f3..290ba158 100644
--- a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
+++ b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
@@ -853,7 +853,7 @@ namespace QCD {
 
               a0 = a0 + incr;
               a1 = a1 + incr;
-              a2 = a2 + sizeof(Simd::scalar_type);
+              a2 = a2 + sizeof(typename Simd::scalar_type);
             }
           }
 
diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h
index 7b5964ad..def37b9b 100644
--- a/lib/simd/Intel512avx.h
+++ b/lib/simd/Intel512avx.h
@@ -79,7 +79,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZEND2f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp   ";\n"\
                                   "vsubps  " #tmp "," #Ciirr "," #Criir"{%k7}"  ";\n"
 
-#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\ 
+#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\
                                   "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
 
 #define ZEND2d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp   ";\n"\

From fb24e3a7d24abb2bcdef4c85711ce0d25319a153 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Mon, 29 Jan 2018 11:11:45 +0100
Subject: [PATCH 04/17] Adding utilities for perf profiling

---
 benchmarks/Benchmark_wilson.cc | 24 +++++++++++-
 lib/util/Profiling.h           | 72 ++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 lib/util/Profiling.h

diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc
index d1499a76..754051f0 100644
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -4,7 +4,7 @@
 
     Source file: ./benchmarks/Benchmark_wilson.cc
 
-    Copyright (C) 2015
+    Copyright (C) 2018
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
@@ -32,6 +32,9 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
+
+#include "Grid/util/Profiling.h"
+
 template<class d>
 struct scal {
   d internal;
@@ -45,6 +48,7 @@ struct scal {
   };
 
 bool overlapComms = false;
+bool perfProfiling = false;
 
 int main (int argc, char ** argv)
 {
@@ -53,6 +57,9 @@ int main (int argc, char ** argv)
   if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
     overlapComms = true;
   }
+  if( GridCmdOptionExists(argv,argv+argc,"--perf") ){
+    perfProfiling = true;
+  }
 
   long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
 
@@ -144,6 +151,21 @@ int main (int argc, char ** argv)
   double t1=usecond();
   double flops=single_site_flops*volume*ncall;
   
+  if (perfProfiling){
+  std::cout<<GridLogMessage << "Profiling Dw with perf"<<std::endl;
+    
+  System::profile("kernel", [&]() {
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+  });
+
+  std::cout<<GridLogMessage << "Generated kernel.data"<<std::endl;
+  std::cout<<GridLogMessage << "Use with: perf report -i kernel.data"<<std::endl;
+
+  }
+
+
   std::cout<<GridLogMessage << "Called Dw"<<std::endl;
   std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
diff --git a/lib/util/Profiling.h b/lib/util/Profiling.h
new file mode 100644
index 00000000..acdcb0c6
--- /dev/null
+++ b/lib/util/Profiling.h
@@ -0,0 +1,72 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/util/Profiling.h
+
+    Copyright (C) 2018
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#ifndef GRID_PERF_PROFILING_H
+#define GRID_PERF_PROFILING_H
+
+#include <sstream>
+#include <iostream>
+#include <functional>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <signal.h>
+
+struct System
+{
+    // Runs body() while a forked /usr/bin/perf, attached to this pid, records into name (".data" appended if absent).
+    static void profile(const std::string& name,std::function<void()> body) {
+        const std::string filename = name.find(".data") == std::string::npos ? (name + ".data") : name;
+        std::stringstream s;
+        s << getpid();
+        const std::string self = s.str(); // built before fork(): keep the child to async-signal-safe calls
+        const pid_t pid = fork();
+        if (pid == 0) {
+            const int fd = open("/dev/null",O_RDWR); // discard perf's own stdout/stderr
+            if (fd >= 0) { dup2(fd,1); dup2(fd,2); }
+            execl("/usr/bin/perf","perf","record","-o",filename.c_str(),"-p",self.c_str(),nullptr);
+            _exit(127); // reached only if exec failed; _exit skips the parent's atexit handlers and stdio flush
+        }
+        if (pid < 0) std::cerr << "System::profile: fork failed, running body without profiler" << std::endl;
+        body();
+
+        // Stop the profiler. Guarded: with pid == -1, kill() would signal every process we are allowed to signal.
+        if (pid > 0) {
+            kill(pid,SIGINT);
+            waitpid(pid,nullptr,0);
+        }
+    }
+
+    static void profile(std::function<void()> body) {
+        profile("perf.data",body);
+    }
+};
+
+#endif // GRID_PERF_PROFILING_H
\ No newline at end of file

From cd44e851f1021db5f895a4caf409c885b35d7bd9 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Tue, 30 Jan 2018 06:04:30 +0100
Subject: [PATCH 05/17] Fixing compilation error in FundtoHirep

---
 extras/Hadrons/Modules/MGauge/FundtoHirep.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extras/Hadrons/Modules/MGauge/FundtoHirep.cc b/extras/Hadrons/Modules/MGauge/FundtoHirep.cc
index f15a3b7c..31c5a34d 100644
--- a/extras/Hadrons/Modules/MGauge/FundtoHirep.cc
+++ b/extras/Hadrons/Modules/MGauge/FundtoHirep.cc
@@ -57,7 +57,7 @@ std::vector<std::string> TFundtoHirep<Rep>::getOutput(void)
 template <typename Rep>
 void TFundtoHirep<Rep>::setup(void)
 {
-    env().template registerLattice<typename Rep::LatticeField>(getName());
+    envCreateLat(typename Rep::LatticeField, getName());
 }
 
 // execution ///////////////////////////////////////////////////////////////////
@@ -70,6 +70,6 @@ void TFundtoHirep<Rep>::execute(void)
     Rep TargetRepresentation(U._grid);
     TargetRepresentation.update_representation(U);
 
-   typename Rep::LatticeField &URep = *env().template createLattice<typename Rep::LatticeField>(getName());
+    auto &URep = envGet(typename Rep::LatticeField, getName());
     URep = TargetRepresentation.U;
 }

From 53bffb83d453080fe5dd16fb5601d16a94997d87 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Tue, 30 Jan 2018 12:42:36 +0100
Subject: [PATCH 06/17] Updating README with new SKL target

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 13dd6996..5a92cdec 100644
--- a/README.md
+++ b/README.md
@@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used:
 | `<code>`    | Description                            |
 | ----------- | -------------------------------------- |
 | `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
+| `SKL`       | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) |
 | `BGQ`       | Blue Gene/Q                            |
 
 #### Notes:
-- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
+- We currently support AVX512 for the Intel compiler and GCC (SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
 - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.

From f0fcdf75b5b7c6be03224a50b1157170e441b3b5 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Tue, 30 Jan 2018 12:44:20 +0100
Subject: [PATCH 07/17] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5a92cdec..86506f52 100644
--- a/README.md
+++ b/README.md
@@ -191,7 +191,7 @@ Alternatively, some CPU codenames can be directly used:
 | `BGQ`       | Blue Gene/Q                            |
 
 #### Notes:
-- We currently support AVX512 for the Intel compiler and GCC (SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced.
+- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
 - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.

From 896f3a8002b3116380e2293cf3ecca350c34ce5d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 1 Feb 2018 18:51:51 +0000
Subject: [PATCH 08/17] Fix to MPI for Hokusai system

---
 lib/communicator/SharedMemoryMPI.cc | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/lib/communicator/SharedMemoryMPI.cc b/lib/communicator/SharedMemoryMPI.cc
index d7bd7c65..2a62b7ac 100644
--- a/lib/communicator/SharedMemoryMPI.cc
+++ b/lib/communicator/SharedMemoryMPI.cc
@@ -182,6 +182,7 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<<std::endl;
   assert(_ShmSetup==1);
   assert(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -218,6 +219,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     assert(((uint64_t)ptr&0x3F)==0);
     close(fd);
     WorldShmCommBufs[r] =ptr;
+    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
   }
   _ShmAlloc=1;
   _ShmAllocBytes  = bytes;
@@ -232,6 +234,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
+  std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
   assert(_ShmSetup==1);
   assert(_ShmAlloc==0); 
   MPI_Barrier(WorldShmComm);
@@ -259,7 +262,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
       
-      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
+      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      if ( ptr == (void * )MAP_FAILED ) {       
+	perror("failed mmap");     
+	assert(0);    
+      }
       assert(((uint64_t)ptr&0x3F)==0);
       
       WorldShmCommBufs[r] =ptr;
@@ -318,11 +325,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   heap_size = GlobalSharedMemory::ShmAllocBytes();
   for(int r=0;r<ShmSize;r++){
 
-    uint32_t sr = (r==ShmRank) ? GlobalSharedMemory::WorldRank : 0 ;
+    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
 
-    MPI_Allreduce(MPI_IN_PLACE,&sr,1,MPI_UINT32_T,MPI_SUM,comm);
+    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
 
-    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[sr];
+    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl;
   }
   ShmBufferFreeAll();
 

From 237a8ec9181d87c115e1569456430187c071adb0 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 12 Feb 2018 13:27:20 +0000
Subject: [PATCH 09/17] Communicator leak fixed (I think)

---
 lib/communicator/Communicator_mpi3.cc | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index ef47d617..6732dcdf 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -49,6 +49,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
 
   Grid_quiesce_nodes();
 
+  // Never clean up as done once.
   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
 
   GlobalSharedMemory::Init(communicator_world);
@@ -88,6 +89,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm); // Remap using the shared memory optimising routine
   InitFromMPICommunicator(processors,optimal_comm);
   SetCommunicator(optimal_comm);
+  // Free the temp communicator
+  MPI_Comm_free(&optimal_comm);
 }
 
 //////////////////////////////////
@@ -183,8 +186,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
 
   } else {
     srank = 0;
-    comm_split    = parent.communicator;
-    //    std::cout << " Inherited communicator " <<comm_split <<std::endl;
+    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
+    assert(ierr==0);
   }
 
   //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -197,6 +200,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   SetCommunicator(comm_split);
 
+  // Free the temp communicator 
+  MPI_Comm_free(&comm_split);
+
   if(0){ 
     std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
     for(int d=0;d<processors.size();d++){
@@ -210,6 +216,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
 
 void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
 {
+  ////////////////////////////////////////////////////
+  // Creates communicator, and the communicator_halo
+  ////////////////////////////////////////////////////
   _ndimension = processors.size();
   _processor_coor.resize(_ndimension);
 

From 7b8b2731e702838e3b5696faca6746f5f8157d02 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 12 Feb 2018 16:06:31 +0000
Subject: [PATCH 10/17] Conj error for complex coeffs

---
 lib/qcd/action/fermion/CayleyFermion5D.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc
index eace6484..e053b98c 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -73,7 +73,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
   this->DW(psi,tmp_f,DaggerYes);
 
   for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
   }
 }
 

From dd8f2a64febf1c69ff5d05490536dcaaac645a92 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 13 Feb 2018 02:08:49 +0000
Subject: [PATCH 11/17] Interface to suit Hadrons on Lanczos

---
 .../iterative/ImplicitlyRestartedLanczos.h    |   7 +
 .../iterative/LocalCoherenceLanczos.h         | 187 ++++++++++++------
 tests/debug/Test_cayley_coarsen_support.cc    |   3 +-
 tests/debug/Test_cayley_ldop_cr.cc            |   3 +-
 .../Test_dwf_compressed_lanczos_reorg.cc      |  14 +-
 5 files changed, 143 insertions(+), 71 deletions(-)

diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 7b85c095..b4fca33a 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -181,6 +181,13 @@ enum IRLdiagonalisation {
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
  public:
+
+  static void Deflate(const std::vector<Field> &_v,
+		      const std::vector<RealD>& eval,
+		      const Field& src_orig,Field& result) {
+    basisDeflate(_v,eval,src_orig,result);
+  }
+
   LinearFunction<Field>       &_HermOp;
   ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
   int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
diff --git a/lib/algorithms/iterative/LocalCoherenceLanczos.h b/lib/algorithms/iterative/LocalCoherenceLanczos.h
index d5d1bbc2..c530a572 100644
--- a/lib/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h
@@ -70,21 +70,24 @@ public:
   typedef Lattice<Fobj>          FineField;
 
   LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;
 
-  ProjectedHermOp(LinearOperatorBase<FineField>& linop,  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
-    _Linop(linop),
-    _Aggregate(aggregate)  {  };
+  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : 
+    _Linop(linop), subspace(_subspace)
+  {  
+    assert(subspace.size() >0);
+  };
 
   void operator()(const CoarseField& in, CoarseField& out) {
+    GridBase *FineGrid = subspace[0]._grid;    
+    int   checkerboard = subspace[0].checkerboard;
+      
+    FineField fin (FineGrid);     fin.checkerboard= checkerboard;
+    FineField fout(FineGrid);   fout.checkerboard = checkerboard;
 
-    GridBase *FineGrid = _Aggregate.FineGrid;
-    FineField fin(FineGrid);
-    FineField fout(FineGrid);
-
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
-    _Linop.HermOp(fin,fout);                   std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
+    blockPromote(in,fin,subspace);       std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
+    _Linop.HermOp(fin,fout);             std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
+    blockProject(out,fout,subspace);     std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
   }
 };
 
@@ -99,24 +102,27 @@ public:
 
   OperatorFunction<FineField>   & _poly;
   LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;
 
-  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop, 
-			  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
+  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
+			  LinearOperatorBase<FineField>& linop, 
+			  std::vector<FineField> & _subspace) :
     _poly(poly),
     _Linop(linop),
-    _Aggregate(aggregate)  {  };
+    subspace(_subspace)
+  {  };
 
   void operator()(const CoarseField& in, CoarseField& out) {
-
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    FineField fin(FineGrid) ;fin.checkerboard  =_Aggregate.checkerboard;
-    FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
     
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
+    GridBase *FineGrid = subspace[0]._grid;    
+    int   checkerboard = subspace[0].checkerboard;
+
+    FineField fin (FineGrid); fin.checkerboard =checkerboard;
+    FineField fout(FineGrid);fout.checkerboard =checkerboard;
+    
+    blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
     _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
+    blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
   }
 };
 
@@ -132,19 +138,23 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
   LinearFunction<CoarseField> & _Poly;
   OperatorFunction<FineField>   & _smoother;
   LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
-  RealD                             _coarse_relax_tol;
+  RealD                          _coarse_relax_tol;
+  std::vector<FineField>        &_subspace;
+  
   ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
-					   Aggregation<Fobj,CComplex,nbasis> &Aggregate,
+					   std::vector<FineField>        &subspace,
 					   RealD coarse_relax_tol=5.0e3) 
-    : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol)  {    };
+    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
+      _coarse_relax_tol(coarse_relax_tol)  
+  {    };
 
   int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
     CoarseField v(B);
     RealD eval_poly = eval;
+
     // Apply operator
     _Poly(B,v);
 
@@ -168,14 +178,13 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
   }
   int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    int checkerboard   = _Aggregate.checkerboard;
-
+    GridBase *FineGrid = _subspace[0]._grid;    
+    int checkerboard   = _subspace[0].checkerboard;
     FineField fB(FineGrid);fB.checkerboard =checkerboard;
     FineField fv(FineGrid);fv.checkerboard =checkerboard;
 
-    _Aggregate.PromoteFromSubspace(B,fv);
+    blockPromote(B,fv,_subspace);  
+    
     _smoother(_Linop,fv,fB); 
 
     RealD eval_poly = eval;
@@ -217,27 +226,80 @@ protected:
   int _checkerboard;
   LinearOperatorBase<FineField>                 & _FineOp;
   
-  // FIXME replace Aggregation with vector of fine; the code reuse is too small for
-  // the hassle and complexity of cross coupling.
-  Aggregation<Fobj,CComplex,nbasis>               _Aggregate;  
-  std::vector<RealD>                              evals_fine;
-  std::vector<RealD>                              evals_coarse; 
-  std::vector<CoarseField>                        evec_coarse;
+  std::vector<RealD>                              &evals_fine;
+  std::vector<RealD>                              &evals_coarse; 
+  std::vector<FineField>                          &subspace;
+  std::vector<CoarseField>                        &evec_coarse;
+
+private:
+  std::vector<RealD>                              _evals_fine;
+  std::vector<RealD>                              _evals_coarse; 
+  std::vector<FineField>                          _subspace;
+  std::vector<CoarseField>                        _evec_coarse;
+
 public:
+  static void Deflate(std::vector<FineField>   subspace,
+		      std::vector<CoarseField> evec_coarse,
+		      std::vector<RealD>       eval_coarse,
+		      const FineField& src_orig,FineField& result) 
+  {
+    int N = (int)evec_coarse.size();
+    CoarseField src_coarse(evec_coarse[0]._grid);
+    CoarseField res_coarse(evec_coarse[0]._grid);    res_coarse = zero;
+    blockProject(src_orig,src_coarse,subspace);    
+    for (int i=0;i<N;i++) {
+      CoarseField & tmp = evec_coarse[i];
+      axpy(res_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,res_coarse);
+    }
+    blockPromote(res_coarse,result,subspace);
+  };
+
   LocalCoherenceLanczos(GridBase *FineGrid,
-		GridBase *CoarseGrid,
-		LinearOperatorBase<FineField> &FineOp,
-		int checkerboard) :
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard) :
     _CoarseGrid(CoarseGrid),
     _FineGrid(FineGrid),
-    _Aggregate(CoarseGrid,FineGrid,checkerboard),
     _FineOp(FineOp),
-    _checkerboard(checkerboard)
+    _checkerboard(checkerboard),
+    evals_fine  (_evals_fine),
+    evals_coarse(_evals_coarse),
+    subspace    (_subspace),
+    evec_coarse(_evec_coarse)
   {
     evals_fine.resize(0);
     evals_coarse.resize(0);
   };
-  void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
+  //////////////////////////////////////////////////////////////////////////
+  // Alternate constructor, external storage for use by Hadrons module
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard,
+			std::vector<FineField>   &ext_subspace,
+			std::vector<CoarseField> &ext_coarse,
+			std::vector<RealD>       &ext_eval_fine,
+			std::vector<RealD>       &ext_eval_coarse
+			) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _FineOp(FineOp),
+    _checkerboard(checkerboard),
+    evals_fine  (ext_eval_fine), 
+    evals_coarse(ext_eval_coarse),
+    subspace    (ext_subspace),
+    evec_coarse (ext_coarse)
+  {
+    evals_fine.resize(0);
+    evals_coarse.resize(0);
+  };
+
+  void Orthogonalise(void ) {
+    CoarseScalar InnerProd(_CoarseGrid); 
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+  };
 
   template<typename T>  static RealD normalise(T& v) 
   {
@@ -246,43 +308,44 @@ public:
     v = v * (1.0/nn);
     return nn;
   }
-
+  /*
   void fakeFine(void)
   {
     int Nk = nbasis;
-    _Aggregate.subspace.resize(Nk,_FineGrid);
-    _Aggregate.subspace[0]=1.0;
-    _Aggregate.subspace[0].checkerboard=_checkerboard;
-    normalise(_Aggregate.subspace[0]);
+    subspace.resize(Nk,_FineGrid);
+    subspace[0]=1.0;
+    subspace[0].checkerboard=_checkerboard;
+    normalise(subspace[0]);
     PlainHermOp<FineField>    Op(_FineOp);
     for(int k=1;k<Nk;k++){
-      _Aggregate.subspace[k].checkerboard=_checkerboard;
-      Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
-      normalise(_Aggregate.subspace[k]);
+      subspace[k].checkerboard=_checkerboard;
+      Op(subspace[k-1],subspace[k]);
+      normalise(subspace[k]);
     }
   }
+  */
 
   void testFine(RealD resid) 
   {
     assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
     PlainHermOp<FineField>    Op(_FineOp);
     ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
     for(int k=0;k<nbasis;k++){
-      assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
+      assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
     }
   }
 
   void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
   {
     assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
     //////////////////////////////////////////////////////////////////////////////////////////////////
     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
     //////////////////////////////////////////////////////////////////////////////////////////////////
     Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace); // active basis reference; private _subspace stays empty when external storage is supplied
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
 
     for(int k=0;k<evec_coarse.size();k++){
       if ( k < nbasis ) { 
@@ -302,34 +365,34 @@ public:
     PlainHermOp<FineField>    Op(_FineOp);
 
     evals_fine.resize(Nm);
-    _Aggregate.subspace.resize(Nm,_FineGrid);
+    subspace.resize(Nm,_FineGrid);
 
     ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
 
     FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
 
     int Nconv;
-    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
+    IRL.calc(evals_fine,subspace,src,Nconv,false);
     
     // Shrink down to number saved
     assert(Nstop>=nbasis);
     assert(Nconv>=nbasis);
     evals_fine.resize(nbasis);
-    _Aggregate.subspace.resize(nbasis,_FineGrid);
+    subspace.resize(nbasis,_FineGrid);
   }
   void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
   {
     Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_Aggregate);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);             // active basis reference, not the private _subspace store
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); // _subspace stays empty when external storage is supplied
     //////////////////////////////////////////////////////////////////////////////////////////////////
     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
     //////////////////////////////////////////////////////////////////////////////////////////////////
 
     Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); // active basis reference; private _subspace stays empty with external storage
 
     evals_coarse.resize(Nm);
     evec_coarse.resize(Nm,_CoarseGrid);
diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc
index c6532a0d..f57823e5 100644
--- a/tests/debug/Test_cayley_coarsen_support.cc
+++ b/tests/debug/Test_cayley_coarsen_support.cc
@@ -111,6 +111,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage<<"Error "<<norm2(err)<<std::endl;
 
   const int nbasis = 2;
+  const int cb = 0 ;
   LatticeFermion prom(FGrid);
 
   std::vector<LatticeFermion> subspace(nbasis,FGrid);
@@ -119,7 +120,7 @@ int main (int argc, char ** argv)
 
   MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
-  Subspace Aggregates(Coarse5d,FGrid);
+  Subspace Aggregates(Coarse5d,FGrid,cb);
   Aggregates.CreateSubspaceRandom(RNG5);
 
   subspace=Aggregates.subspace;
diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc
index cbefdd46..c6005fd0 100644
--- a/tests/debug/Test_cayley_ldop_cr.cc
+++ b/tests/debug/Test_cayley_ldop_cr.cc
@@ -78,6 +78,7 @@ int main (int argc, char ** argv)
 
   RealD mass=0.1;
   RealD M5=1.5;
+  int cb=0;
 
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
@@ -95,7 +96,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Calling Aggregation class to build subspace" <<std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
-  Subspace Aggregates(Coarse5d,FGrid);
+  Subspace Aggregates(Coarse5d,FGrid,cb);
   Aggregates.CreateSubspace(RNG5,HermDefOp);
 
 
diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
index 4c702a33..3dff4b90 100644
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
@@ -56,12 +56,12 @@ public:
 
   void checkpointFine(std::string evecs_file,std::string evals_file)
   {
-    assert(this->_Aggregate.subspace.size()==nbasis);
+    assert(this->subspace.size()==nbasis);
     emptyUserRecord record;
     Grid::QCD::ScidacWriter WR;
     WR.open(evecs_file);
     for(int k=0;k<nbasis;k++) {
-      WR.writeScidacFieldRecord(this->_Aggregate.subspace[k],record);
+      WR.writeScidacFieldRecord(this->subspace[k],record);
     }
     WR.close();
     
@@ -72,7 +72,7 @@ public:
   void checkpointFineRestore(std::string evecs_file,std::string evals_file)
   {
     this->evals_fine.resize(nbasis);
-    this->_Aggregate.subspace.resize(nbasis,this->_FineGrid);
+    this->subspace.resize(nbasis,this->_FineGrid);
     
     std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
     XmlReader RDx(evals_file);
@@ -85,8 +85,8 @@ public:
     Grid::QCD::ScidacReader RD ;
     RD.open(evecs_file);
     for(int k=0;k<nbasis;k++) {
-      this->_Aggregate.subspace[k].checkerboard=this->_checkerboard;
-      RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record);
+      this->subspace[k].checkerboard=this->_checkerboard;
+      RD.readScidacFieldRecord(this->subspace[k],record);
       
     }
     RD.close();
@@ -221,7 +221,9 @@ int main (int argc, char ** argv) {
     std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
     _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
     _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
+    std::cout << GridLogIRL<<"Orthogonalising"<<std::endl;
     _LocalCoherenceLanczos.Orthogonalise();
+    std::cout << GridLogIRL<<"Orthogonaled"<<std::endl;
   }
 
   if ( Params.doFineRead ) { 
@@ -231,8 +233,6 @@ int main (int argc, char ** argv) {
   }
 
   if ( Params.doCoarse ) {
-    std::cout << GridLogMessage << "Orthogonalising " << nbasis<<" Nm "<<Nm2<< std::endl;
-    
     std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
     _LocalCoherenceLanczos.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
 			      coarse.Nstop, coarse.Nk,coarse.Nm,

From ae31a6a760dc36ec984e93a4eab1acd344e59846 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 13 Feb 2018 02:11:37 +0000
Subject: [PATCH 12/17] Move deflate to right class

---
 .../iterative/ImplicitlyRestartedLanczos.h          | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index b4fca33a..7d5a1889 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -182,12 +182,6 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public Imp
 {
  public:
 
-  static void Deflate(const std::vector<Field> &_v,
-		      const std::vector<RealD>& eval,
-		      const Field& src_orig,Field& result) {
-    basisDeflate(_v,eval,src_orig,result);
-  }
-
   LinearFunction<Field>       &_HermOp;
   ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
   int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@@ -250,6 +244,13 @@ class ImplicitlyRestartedLanczos {
   /////////////////////////
   
 public:       
+
+  static void Deflate(const std::vector<Field> &_v,
+		      const std::vector<RealD>& eval,
+		      const Field& src_orig,Field& result) {
+    basisDeflate(_v,eval,src_orig,result);
+  }
+
   //////////////////////////////////////////////////////////////////
   // PAB:
   //////////////////////////////////////////////////////////////////

From c96483e3bd559ab4a20c12102534c37447179b4c Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 13 Feb 2018 11:39:07 +0000
Subject: [PATCH 13/17] Whitespace only change

---
 lib/algorithms/iterative/LocalCoherenceLanczos.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/algorithms/iterative/LocalCoherenceLanczos.h b/lib/algorithms/iterative/LocalCoherenceLanczos.h
index c530a572..4c05f4c7 100644
--- a/lib/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h
@@ -28,7 +28,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 #ifndef GRID_LOCAL_COHERENCE_IRL_H
 #define GRID_LOCAL_COHERENCE_IRL_H
+
 namespace Grid { 
+
 struct LanczosParams : Serializable {
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,

From e30a80a2340275774e464b5ce7b328f0ece84b44 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Thu, 15 Feb 2018 17:13:36 +0000
Subject: [PATCH 14/17] Relaxed constraints on MPI thread mode when not using
 multiple comms threads

---
 lib/communicator/Communicator_mpi3.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 6732dcdf..eb0144f0 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -44,7 +44,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
   MPI_Initialized(&flag); // needed to coexist with other libs apparently
   if ( !flag ) {
     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    assert (provided == MPI_THREAD_MULTIPLE);
+    //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
+    if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
+        (nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
+      assert(0);
   }
 
   Grid_quiesce_nodes();

From 945684c470845d826fdbb8511ddf098a90779188 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 20 Feb 2018 14:28:38 +0000
Subject: [PATCH 15/17] updates for deflation in the RB solver

---
 lib/algorithms/Algorithms.h                   |  1 +
 .../iterative/ImplicitlyRestartedLanczos.h    | 19 -----------
 .../iterative/LocalCoherenceLanczos.h         | 16 +--------
 lib/algorithms/iterative/SchurRedBlack.h      | 33 ++++++++++++++++---
 4 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/lib/algorithms/Algorithms.h b/lib/algorithms/Algorithms.h
index 070a1019..ef147c53 100644
--- a/lib/algorithms/Algorithms.h
+++ b/lib/algorithms/Algorithms.h
@@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
 
+#include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 7d5a1889..787cf15a 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -149,19 +149,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
   basisReorderInPlace(_v,sort_vals,idx);
 }
 
-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template<class Field>
-void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
-  result = zero;
-  assert(_v.size()==eval.size());
-  int N = (int)_v.size();
-  for (int i=0;i<N;i++) {
-    Field& tmp = _v[i];
-    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
-  }
-}
-
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
@@ -245,12 +232,6 @@ class ImplicitlyRestartedLanczos {
   
 public:       
 
-  static void Deflate(const std::vector<Field> &_v,
-		      const std::vector<RealD>& eval,
-		      const Field& src_orig,Field& result) {
-    basisDeflate(_v,eval,src_orig,result);
-  }
-
   //////////////////////////////////////////////////////////////////
   // PAB:
   //////////////////////////////////////////////////////////////////
diff --git a/lib/algorithms/iterative/LocalCoherenceLanczos.h b/lib/algorithms/iterative/LocalCoherenceLanczos.h
index 4c05f4c7..b8348c0c 100644
--- a/lib/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h
@@ -31,6 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 namespace Grid { 
 
+
 struct LanczosParams : Serializable {
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
@@ -240,21 +241,6 @@ private:
   std::vector<CoarseField>                        _evec_coarse;
 
 public:
-  static void Deflate(std::vector<FineField>   subspace,
-		      std::vector<CoarseField> evec_coarse,
-		      std::vector<RealD>       eval_coarse,
-		      const FineField& src_orig,FineField& result) 
-  {
-    int N = (int)evec_coarse.size();
-    CoarseField src_coarse(evec_coarse[0]._grid);
-    CoarseField res_coarse(evec_coarse[0]._grid);    res_coarse = zero;
-    blockProject(src_orig,src_coarse,subspace);    
-    for (int i=0;i<N;i++) {
-      CoarseField & tmp = evec_coarse[i];
-      axpy(res_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,res_coarse);
-    }
-    blockPromote(res_coarse,result,subspace);
-  };
 
   LocalCoherenceLanczos(GridBase *FineGrid,
 			GridBase *CoarseGrid,
diff --git a/lib/algorithms/iterative/SchurRedBlack.h b/lib/algorithms/iterative/SchurRedBlack.h
index 5f5a8b66..fac2030f 100644
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -107,7 +107,12 @@ namespace Grid {
     };
 
     template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
 
       // FIXME CGdiagonalMee not implemented virtual function
       // FIXME use CBfactorise to control schur decomp
@@ -129,7 +134,6 @@ namespace Grid {
       pickCheckerboard(Odd ,src_o,in);
       pickCheckerboard(Even,sol_e,out);
       pickCheckerboard(Odd ,sol_o,out);
-
       std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
     
       /////////////////////////////////////////////////////
@@ -146,6 +150,7 @@ namespace Grid {
       // Call the red-black solver
       //////////////////////////////////////////////////////////////
       std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
+      guess(src_o,sol_o);
       _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
       std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;
 
@@ -189,7 +194,12 @@ namespace Grid {
     CBfactorise=cb;
   };
     template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
 
       // FIXME CGdiagonalMee not implemented virtual function
       // FIXME use CBfactorise to control schur decomp
@@ -225,6 +235,7 @@ namespace Grid {
       // Call the red-black solver
       //////////////////////////////////////////////////////////////
       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      guess(src_o,sol_o);
       _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
 
       ///////////////////////////////////////////////////
@@ -268,7 +279,12 @@ namespace Grid {
     };
 
     template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){ // overload without a guesser argument
+      ZeroGuesser guess;              // ZeroGuesser assigns Zero() to the field it is given
+      (*this)(_Matrix,in,out,guess);  // forward to the overload that takes a Guesser
+    }
+    template<class Matrix,class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
 
       // FIXME CGdiagonalMee not implemented virtual function
       // FIXME use CBfactorise to control schur decomp
@@ -305,6 +321,7 @@ namespace Grid {
       //////////////////////////////////////////////////////////////
       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+      guess(src_o,tmp);
       _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
       _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);
 
@@ -347,7 +364,12 @@ namespace Grid {
     };
 
     template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){ // overload without a guesser argument
+      ZeroGuesser guess;              // ZeroGuesser assigns Zero() to the field it is given
+      (*this)(_Matrix,in,out,guess);  // forward to the overload that takes a Guesser
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
 
       // FIXME CGdiagonalMee not implemented virtual function
       // FIXME use CBfactorise to control schur decomp
@@ -385,6 +407,7 @@ namespace Grid {
       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
 //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
+      guess(src_o,tmp);
       _HermitianRBSolver(src_o,tmp);  assert(tmp.checkerboard==Odd);
       _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);
 

From 559a501140a44e9e8440b276169ef283fc592974 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 20 Feb 2018 14:29:08 +0000
Subject: [PATCH 16/17] Deflation interface for solvers

---
 lib/algorithms/iterative/Deflation.h | 101 +++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 lib/algorithms/iterative/Deflation.h

diff --git a/lib/algorithms/iterative/Deflation.h b/lib/algorithms/iterative/Deflation.h
new file mode 100644
index 00000000..b6aa0d3d
--- /dev/null
+++ b/lib/algorithms/iterative/Deflation.h
@@ -0,0 +1,101 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/Deflation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_DEFLATION_H
+#define GRID_DEFLATION_H
+
+namespace Grid { 
+
+struct ZeroGuesser {
+  // Initial-guess functor: src is not read; guess is overwritten with Zero().
+  template<class Field> 
+  void operator()(const Field &src,Field &guess) { guess = Zero(); }
+};
+struct SourceGuesser {
+  // Initial-guess functor: guess is overwritten with a copy of src.
+  template<class Field> 
+  void operator()(const Field &src,Field &guess) { guess = src; }
+};
+
+////////////////////////////////
+// Fine grid deflation
+////////////////////////////////
+template<class Field>
+struct DeflatedGuesser {
+private:
+  const std::vector<Field> &evec;  // held by reference, not copied: the caller must keep it alive
+  const std::vector<RealD> &eval;  // eval[i] is the divisor applied to the evec[i] term
+  // guess = sum_i evec[i] * innerProduct(evec[i],src) / eval[i]   (assumes axpy(r,a,x,y) sets r = a*x + y)
+public:
+  // Stores the two references only.
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
+  // src: field the guess is built from; guess: overwritten with the accumulated sum.
+  void operator()(const Field &src,Field &guess) { 
+    guess = zero;
+    assert(evec.size()==eval.size());
+    int N = (int)evec.size();  // int count, as in LocalCoherenceDeflatedGuesser; avoids a signed/unsigned compare
+    for (int i=0;i<N;i++) {
+      const Field& tmp = evec[i];  // evec is a const vector: a non-const reference cannot bind to its elements
+      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
+    }
+  }
+};
+
+template<class FineField, class CoarseField>
+class LocalCoherenceDeflatedGuesser {
+private:
+  const std::vector<FineField>   &subspace;     // fine-grid vectors handed to blockProject/blockPromote
+  const std::vector<CoarseField> &evec_coarse;  // coarse-grid vectors; evec_coarse[0]._grid supplies the coarse grid
+  const std::vector<RealD>       &eval_coarse;  // eval_coarse[i] is the divisor applied to the evec_coarse[i] term
+public:
+  // Stores references only: the caller must keep all three vectors alive.
+  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
+				const std::vector<CoarseField> &_evec_coarse,
+				const std::vector<RealD>       &_eval_coarse)
+    : subspace(_subspace), 
+      evec_coarse(_evec_coarse), 
+      eval_coarse(_eval_coarse)  
+  {}
+  // Projects src with blockProject, accumulates the coarse guess, then blockPromotes it into guess.
+  void operator()(const FineField &src,FineField &guess) { 
+    int N = (int)evec_coarse.size();  assert(evec_coarse.size()==eval_coarse.size());
+    if (N==0) { guess = zero; return; }  // nothing to deflate with; also protects evec_coarse[0] below
+    CoarseField src_coarse(evec_coarse[0]._grid);
+    CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero;
+    blockProject(src,src_coarse,subspace);    
+    for (int i=0;i<N;i++) {
+      const CoarseField & tmp = evec_coarse[i];  // const vector: a non-const reference cannot bind to its elements
+      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
+    }
+    blockPromote(guess_coarse,guess,subspace);
+  };
+};
+
+
+
+}
+#endif

From 4790e99817c894d311c0d2ad149444c52f76668c Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 20 Feb 2018 15:12:31 +0000
Subject: [PATCH 17/17] Extra communicator free that I had missed. Hard to
 audit them all as this is complex

---
 lib/communicator/Communicator_mpi3.cc | 12 ++++++++++--
 lib/communicator/SharedMemory.h       |  1 +
 lib/communicator/SharedMemoryMPI.cc   |  4 ++++
 lib/communicator/SharedMemoryNone.cc  |  2 ++
 4 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index eb0144f0..424b7973 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -89,10 +89,16 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
 {
   MPI_Comm optimal_comm;
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm); // Remap using the shared memory optimising routine
+  ////////////////////////////////////////////////////
+  // Remap using the shared memory optimising routine
+  // The remap creates a comm (optimal_comm) which must be freed once it has been handed on below
+  ////////////////////////////////////////////////////
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
   InitFromMPICommunicator(processors,optimal_comm);
   SetCommunicator(optimal_comm);
+  ///////////////////////////////////////////////////
   // Free the temp communicator
+  ///////////////////////////////////////////////////
   MPI_Comm_free(&optimal_comm);
 }
 
@@ -202,8 +208,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   // Take the right SHM buffers
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   SetCommunicator(comm_split);
-
+  
+  ///////////////////////////////////////////////
   // Free the temp communicator 
+  ///////////////////////////////////////////////
   MPI_Comm_free(&comm_split);
 
   if(0){ 
diff --git a/lib/communicator/SharedMemory.h b/lib/communicator/SharedMemory.h
index 0f647dc6..9f6b1a25 100644
--- a/lib/communicator/SharedMemory.h
+++ b/lib/communicator/SharedMemory.h
@@ -133,6 +133,7 @@ class SharedMemory
 
  public:
   SharedMemory() {};
+  ~SharedMemory();
   ///////////////////////////////////////////////////////////////////////////////////////
   // set the buffers & sizes
   ///////////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/communicator/SharedMemoryMPI.cc b/lib/communicator/SharedMemoryMPI.cc
index 2a62b7ac..9e5d8f15 100644
--- a/lib/communicator/SharedMemoryMPI.cc
+++ b/lib/communicator/SharedMemoryMPI.cc
@@ -399,5 +399,9 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
     return (void *) remote;
   }
 }
+SharedMemory::~SharedMemory()
+{ // Releases ShmComm. MPI_Comm_free is erroneous after MPI_Finalize and on MPI_COMM_NULL, so guard both.
+  int finalised = 0;  MPI_Finalized(&finalised);  // MPI_Finalized may be called even after MPI_Finalize
+  if ( !finalised && (ShmComm != MPI_COMM_NULL) ) MPI_Comm_free(&ShmComm); }
 
 }
diff --git a/lib/communicator/SharedMemoryNone.cc b/lib/communicator/SharedMemoryNone.cc
index 7feed7e4..a23e3c1c 100644
--- a/lib/communicator/SharedMemoryNone.cc
+++ b/lib/communicator/SharedMemoryNone.cc
@@ -122,5 +122,7 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
   return NULL;
 }
+SharedMemory::~SharedMemory()
+{ /* empty destructor: this implementation releases nothing here */ }
 
 }