From f2a4f1311113505a280cc8cdb67db22bbf3f7cf2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 10 Dec 2019 19:32:12 -0500 Subject: [PATCH 01/43] Must offload the Coarsened matrix if Stencil buffers are device resident --- Grid/algorithms/CoarsenedMatrix.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 913f5c0c..45d5e8f7 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -128,7 +128,7 @@ public: for(int i=0;ioSites(),{ + accelerator_for(ss, CoarseGrid->oSites(),1,{ eProj[ss](i)=CComplex(1.0); }); eProj=eProj - iProj; @@ -307,10 +307,12 @@ public: RealD Nin = norm2(in); SimpleCompressor compressor; + Stencil.HaloExchange(in,compressor); + auto in_v = in.View(); auto out_v = out.View(); - thread_for(ss,Grid()->oSites(),{ + accelerator_for(ss,Grid()->oSites(),1,{ siteVector res = Zero(); siteVector nbr; int ptype; @@ -331,6 +333,7 @@ public: } vstream(out_v[ss],res); }); + RealD Nout= norm2(out); return Nout; }; @@ -356,6 +359,7 @@ public: conformable(in.Grid(),out.Grid()); SimpleCompressor compressor; + Stencil.HaloExchange(in,compressor); auto point = [dir, disp](){ @@ -367,7 +371,7 @@ public: auto out_v = out.View(); auto in_v = in.View(); - thread_for(ss,Grid()->oSites(),{ + accelerator_for(ss,Grid()->oSites(),1,{ siteVector res = Zero(); siteVector nbr; int ptype; From 710fee5d2601962419b7c3521423cbc4bc62462c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 10 Dec 2019 21:48:42 -0500 Subject: [PATCH 02/43] Subspace setup testing code and timing verbose --- Grid/algorithms/CoarsenedMatrix.h | 70 +++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 45d5e8f7..c19bef19 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -115,9 +115,9 @@ public: void Orthogonalise(void){ CoarseScalar InnerProd(CoarseGrid); - std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"< &hermop,int nn=nbasis) { RealD scale; + const int dependent=4; - Chebyshev Cheb(0.1,64.0,900); + Chebyshev ChebDependent(1.0,64.0,100); + Chebyshev ChebFilt (0.1,64.0,900); FineField noise(FineGrid); FineField Mn(FineGrid); - for(int b=0;b "< "< "< "< "<oSites(),{ + accelerator_for(ss, Grid()->oSites(),1,{ for(int j=0;j Date: Tue, 10 Dec 2019 21:49:12 -0500 Subject: [PATCH 03/43] Some MPI (summit) create sigusr2, so trap that --- Grid/util/Init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 472013f4..570f4234 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -628,6 +628,7 @@ void Grid_debug_handler_init(void) sigaction(SIGSEGV,&sa,NULL); sigaction(SIGTRAP,&sa,NULL); sigaction(SIGBUS,&sa,NULL); + sigaction(SIGUSR2,&sa,NULL); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); From d73f0b8618e1c4c84a5b0dc76d6b3e8e6dace17f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 10 Dec 2019 21:50:06 -0500 Subject: [PATCH 04/43] Verbose for temporary debug --- Grid/threads/Pragmas.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h index d05f8ee9..4d713258 100644 --- a/Grid/threads/Pragmas.h +++ b/Grid/threads/Pragmas.h @@ -43,6 +43,7 @@ Author: paboyle #ifdef _OPENMP #define GRID_OMP #include +#warning "Grid is using OpenMP for host loops" #endif #ifdef GRID_OMP From 736b19485e1d7306f9894dcd42f5854102912780 Mon Sep 17 00:00:00 
2001 From: Peter Boyle Date: Fri, 13 Dec 2019 21:30:48 -0500 Subject: [PATCH 05/43] Faster set up and some dead code ifdef'ed out --- Grid/algorithms/CoarsenedMatrix.h | 58 +++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index c19bef19..7f729bbc 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -240,7 +240,7 @@ public: // // World of possibilities here. // Experiments - // i) Use inverse iteration method equivaleent with Chebyshve + // i) Use inverse iteration method equivaleent with Chebyshev // ii) Multiply by Fourier phases // iii) Multiply by Fourier phases and refilter // @@ -248,10 +248,31 @@ public: RealD scale; - const int dependent=4; + const int dependent=16; - Chebyshev ChebDependent(1.0,64.0,100); - Chebyshev ChebFilt (0.1,64.0,900); + Chebyshev ChebFilt (0.03,64.0,500); + Chebyshev ChebDependent(0.01,64.0,200); + +#if 0 + auto latt_size = FineGrid->GlobalDimensions(); + Coordinate Fourier[dependent] = { + Coordinate({0, 0,0,0,0}), + Coordinate({0, 1,0,0,0}), + Coordinate({0,-1,0,0,0}), + Coordinate({0,0, 1,0,0}), + Coordinate({0,0,-1,0,0}), + Coordinate({0,0,0, 1,0}), + Coordinate({0,0,0,-1,0}), + Coordinate({0,0,0,0, 1}), + Coordinate({0,0,0,0,-1}) + }; + + ComplexD ci(0.0,1.0); + Lattice C(FineGrid); + Lattice coor(FineGrid); + FineField save(FineGrid); + FineField tmp (FineGrid); +#endif FineField noise(FineGrid); FineField Mn(FineGrid); @@ -262,16 +283,29 @@ public: gaussian(RNG,noise); scale = std::pow(norm2(noise),-0.5); noise=noise*scale; + // save=noise; // Initial matrix element hermop.Op(noise,Mn); std::cout< "< "< "< Date: Fri, 13 Dec 2019 22:08:11 -0500 Subject: [PATCH 06/43] offload more of mgrid to GPU --- Grid/algorithms/CoarsenedMatrix.h | 32 ------------------------------- Grid/lattice/Lattice_transfer.h | 20 +++++++++++-------- 2 files changed, 12 insertions(+), 40 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 7f729bbc..e47137f9 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -253,27 +253,6 @@ public: Chebyshev ChebFilt (0.03,64.0,500); Chebyshev ChebDependent(0.01,64.0,200); -#if 0 - auto latt_size = FineGrid->GlobalDimensions(); - Coordinate Fourier[dependent] = { - Coordinate({0, 0,0,0,0}), - Coordinate({0, 1,0,0,0}), - Coordinate({0,-1,0,0,0}), - Coordinate({0,0, 1,0,0}), - Coordinate({0,0,-1,0,0}), - Coordinate({0,0,0, 1,0}), - Coordinate({0,0,0,-1,0}), - Coordinate({0,0,0,0, 1}), - Coordinate({0,0,0,0,-1}) - }; - - ComplexD ci(0.0,1.0); - Lattice C(FineGrid); - Lattice coor(FineGrid); - FineField save(FineGrid); - FineField tmp (FineGrid); -#endif - FineField noise(FineGrid); FineField Mn(FineGrid); @@ -295,17 +274,6 @@ public: if(b==bb) { ChebFilt(hermop,noise,Mn); } else { -#if 0 - C=Zero(); - for(int mu=0;mu<5;mu++){ - RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; - LatticeCoordinate(coor,mu); - C = C + (TwoPiL * Fourier[dep][mu]) * coor; - } - C = exp(C*ci); // Fourier phase - noise=C*save; - hermop.Op(noise,Mn); std::cout< "< > &coarseData, auto fineData_ = fineData.View(); auto coarseData_ = coarseData.View(); - // Loop over coars parallel, and then loop over fine associated with coarse. + //////////////////////////////////////////////////////////////////////////////////////////////////////// + // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. 
+ // Otherwise do finee inner product per site, and make the update atomic + //////////////////////////////////////////////////////////////////////////////////////////////////////// thread_for( sf, fine->oSites(), { int sc; Coordinate coor_c(_ndimension); @@ -120,10 +123,11 @@ inline void blockProject(Lattice > &coarseData, for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - thread_critical { - for(int i=0;i &fineZ, auto fineY_ = fineY.View(); auto coarseA_= coarseA.View(); - thread_for(sf, fine->oSites(), { + accelerator_for(sf, fine->oSites(), 1, { int sc; Coordinate coor_c(_ndimension); @@ -196,7 +200,7 @@ inline void blockInnerProduct(Lattice &CoarseInner, fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); - thread_for(ss, coarse->oSites(),{ + accelerator_for(ss, coarse->oSites(), 1, { CoarseInner_[ss] = coarse_inner_[ss]; }); } @@ -321,7 +325,7 @@ inline void blockPromote(const Lattice > &coarseData, auto coarseData_ = coarseData.View(); // Loop with a cache friendly loop ordering - thread_for(sf,fine->oSites(),{ + acceelerator_for(sf,fine->oSites(),1,{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); From 152b525a4dc8b8211b8de001fe0333c9db11ac4d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Dec 2019 22:44:42 -0500 Subject: [PATCH 07/43] Typo fix --- Grid/lattice/Lattice_transfer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 02a9e91b..c1c3b542 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -325,7 +325,7 @@ inline void blockPromote(const Lattice > &coarseData, auto coarseData_ = coarseData.View(); // Loop with a cache friendly loop ordering - acceelerator_for(sf,fine->oSites(),1,{ + accelerator_for(sf,fine->oSites(),1,{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); From 9e154749998eb0cfe4c5ada95d79ddaca1e3c4e3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 14 Dec 2019 05:28:16 -0500 Subject: [PATCH 08/43] Accelerator loop attempt at speed up --- Grid/lattice/Lattice_transfer.h | 57 ++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index c1c3b542..9e4003b0 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -106,6 +106,7 @@ inline void blockProject(Lattice > &coarseData, block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); } + int blockVol = fine->oSites()/coarse->oSites(); coarseData=Zero(); @@ -113,20 +114,26 @@ inline void blockProject(Lattice > &coarseData, auto coarseData_ = coarseData.View(); //////////////////////////////////////////////////////////////////////////////////////////////////////// // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. 
- // Otherwise do finee inner product per site, and make the update atomic + // Otherwise do fine inner product per site, and make the update atomic //////////////////////////////////////////////////////////////////////////////////////////////////////// - thread_for( sf, fine->oSites(), { - int sc; - Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + accelerator_for( sc, coarse->oSites(), { - for(int i=0;i_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); + + for(int sb=0;sb_rdimensions); + + for(int i=0;i &coarseData,const Lattice &fineData) for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } + int blockVol = fine->oSites()/coarse->oSites(); // Turn this around to loop threaded over sc and interior loop // over sf would thread better - coarseData=Zero(); auto coarseData_ = coarseData.View(); auto fineData_ = fineData.View(); - thread_for(sf,fine->oSites(),{ - int sc; + accelerator_for(sc,coarse->oSites(),1,{ + + // One thread per sub block Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - thread_critical { + Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); + + for(int sb=0;sb_rdimensions); + coarseData_[sc]=coarseData_[sc]+fineData_[sf]; } From 9aafd204683487795733c0fbe1456e9b84f50179 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 17 Dec 2019 05:01:39 -0500 Subject: [PATCH 09/43] Simple block project promote runs faster on GPU --- Grid/lattice/Lattice_transfer.h | 72 +++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 9e4003b0..0041f47a 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -1,5 +1,4 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/lattice/Lattice_transfer.h @@ -83,12 +82,35 @@ template inline void setCheckerboard(Lattice &full,const Latti }); } - template inline void blockProject(Lattice > &coarseData, + const Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + Lattice ip(coarse); + + // auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.View(); + auto ip_ = ip.View(); + for(int v=0;voSites(), vobj::Nsimd(), { + coalescedWrite(coarseData_[sc](v),ip_(sc)); + }); + } +} + +template +inline void blockProject1(Lattice > &coarseData, const Lattice &fineData, const std::vector > &Basis) { + typedef iVector coarseSiteData; + coarseSiteData elide; + typedef decltype(coalescedRead(elide)) ScalarComplex; GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); int _ndimension = coarse->_ndimension; @@ -116,11 +138,17 @@ inline void blockProject(Lattice > &coarseData, // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. 
// Otherwise do fine inner product per site, and make the update atomic //////////////////////////////////////////////////////////////////////////////////////////////////////// - accelerator_for( sc, coarse->oSites(), { + accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), { + + auto sc=sci/nbasis; + auto i=sci%nbasis; + auto Basis_ = Basis[i].View(); Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - coarseData_[sc]=Zero(); + + int sf; + decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); for(int sb=0;sb > &coarseData, Lexicographic::CoorFromIndex(coor_b,sb,block_r); for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d]; Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions); - - for(int i=0;i &ip,std::vector > } } +#if 0 template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -349,13 +375,35 @@ inline void blockPromote(const Lattice > &coarseData, for(int i=0;i +inline void blockPromote(const Lattice > &coarseData, + Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + fineData=Zero(); + for(int i=0;i > ip = PeekIndex<0>(coarseData,i); + Lattice cip(coarse); + auto cip_ = cip.View(); + auto ip_ = ip.View(); + accelerator_for(sc,coarse->oSites(),1,{ + cip_[sc] = ip_[sc](); + }); + blockZAXPY(fineData,cip,Basis[i],fineData); + } +} +#endif // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Simd layouts need not match since we use peek/poke Local From e4784042915d5f622fefa7e7f4189c11943ae11f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 17 Dec 2019 05:03:25 -0500 Subject: [PATCH 10/43] Tuned up significantly on GPU, but another 10x in coarse space required --- tests/solver/Test_dwf_hdcr.cc | 54 ++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 74adc417..3e603a26 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -33,7 +33,6 @@ Author: paboyle using namespace std; using namespace Grid; - ; class myclass: Serializable { public: @@ -126,7 +125,7 @@ public: CoarseVector Csol(_CoarseOperator.Grid()); ConjugateGradient CG(1.0e-10,100000); - ConjugateGradient fCG(3.0e-2,1000); + ConjugateGradient fCG(1.0e-3,1000); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -191,7 +190,7 @@ public: CoarseVector Csol(_CoarseOperator.Grid()); Csol=Zero(); ConjugateGradient CG(1.0e-10,100000); - ConjugateGradient fCG(3.0e-2,1000); + ConjugateGradient fCG(1.0e-3,1000); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -279,8 +278,7 @@ public: Chebyshev Cheby (params.lo,params.hi,params.order,InverseApproximation); Chebyshev ChebyAccu(params.lo,params.hi,params.order,InverseApproximation); - _Aggregates.ProjectToSubspace (Csrc,in); - + // _Aggregates.ProjectToSubspace (Csrc,in); // _Aggregates.PromoteFromSubspace(Csrc,out); // std::cout< PosdefLdop(LDOp); - ConjugateGradient CG(1.0e-6,100000); + ConjugateGradient CG(1.0e-2,100000); CG(PosdefLdop,c_src,c_res); 
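// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] The coarse-grid Lanczos above feeds
// the DeflatedGuesser used below. A minimal sketch of what such a guesser is
// assumed to do with the coarse eigenpairs (eval[i], evec[i]):
//
//   c_res = Zero();
//   for(int i=0;i<Nconv;i++){
//     ComplexD ip = innerProduct(evec[i],c_src);   // project source onto mode i
//     c_res = c_res + (ip/eval[i]) * evec[i];      // apply 1/lambda_i on that mode
//   }
//
// The subsequent coarse CG then only has to resolve the components of the
// residual outside the deflation space, which is what makes the loose
// coarse-solve tolerance used below affordable.
// ---------------------------------------------------------------------------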
+ /* std::cout< HermIndefLdop(LDOp); // ConjugateResidual MCR(1.0e-6,100000); // MCR(HermIndefLdop,c_src,c_res); @@ -489,10 +503,10 @@ int main (int argc, char ** argv) // HermIndefOpDD,DdwfDD); // TrivialPrecon simple; - std::cout< fCG(1.0e-8,100000); // fCG(HermDefOp,src,result); - std::cout< HermOpEO(Ddwf); - ConjugateGradient pCG(1.0e-8,10000); + // std::cout< HermOpEO(Ddwf); + // ConjugateGradient pCG(1.0e-8,10000); // pCG(HermOpEO,src_o,result_o); // std::cout< Date: Tue, 17 Dec 2019 05:24:45 -0500 Subject: [PATCH 11/43] Coarse grid on GPU, not fast enough yet. Need a 10x --- Grid/algorithms/CoarsenedMatrix.h | 89 +++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index e47137f9..5e5bcbfa 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -117,8 +117,8 @@ public: CoarseScalar InnerProd(CoarseGrid); std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"< "< "< siteVector; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; - + typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; @@ -336,36 +335,70 @@ public: conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - RealD Nin = norm2(in); + + // RealD Nin = norm2(in); SimpleCompressor compressor; + double comms_usec = -usecond(); Stencil.HaloExchange(in,compressor); + comms_usec += usecond(); auto in_v = in.View(); auto out_v = out.View(); - accelerator_for(ss,Grid()->oSites(),1,{ - siteVector res = Zero(); - siteVector nbr; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(); + double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; + double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); + double usecs =-usecond(); + + assert(geom.npoint==9); + + accelerator_for(ss, Grid()->oSites(), Nsimd, { + + calcVector res = Zero(); + calcVector nbr; int ptype; StencilEntry *SE; - for(int point=0;point_is_local&&SE->_permute) { - permute(nbr,in_v[SE->_offset],ptype); - } else if(SE->_is_local) { - nbr = in_v[SE->_offset]; + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); } else { - nbr = Stencil.CommBuf()[SE->_offset]; + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); } - auto A_point = A[point].View(); - res = res + A_point[ss]*nbr; - } - vstream(out_v[ss],res); - }); + synchronise(); + auto A = coalescedRead(Aview_p[point][ss]); + res = res + A*nbr; + } + coalescedWrite(out_v[ss],res,lane); + }); + usecs +=usecond(); + + double nrm_usec=-usecond(); RealD Nout= norm2(out); + nrm_usec+=usecond(); + /* + std::cout << GridLogMessage << "\tNorm " << nrm_usec << " us" < Date: Sat, 28 Dec 2019 10:32:15 -0500 Subject: [PATCH 12/43] Improved Multigrid for DWF --- Grid/algorithms/CoarsenedMatrix.h | 90 ++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 5e5bcbfa..68c820ca 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -117,8 +117,8 @@ public: CoarseScalar InnerProd(CoarseGrid); std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"< &hermop,int nn=nbasis) { + virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, + int nn, + double hi, + std::vector &lo, + std::vector &order + ) { RealD scale; - const 
int dependent=16; - - Chebyshev ChebFilt (0.03,64.0,500); - Chebyshev ChebDependent(0.01,64.0,200); + const int dependent=lo.size(); FineField noise(FineGrid); FineField Mn(FineGrid); @@ -269,15 +271,12 @@ public: // Initial matrix element hermop.Op(noise,Mn); std::cout< "< Cheb(lo[bbb],hi,order[bbb]); bbb++; + Cheb(hermop,noise,Mn); // normalise scale = std::pow(norm2(Mn),-0.5); @@ -354,6 +353,7 @@ public: const int Nsimd = CComplex::Nsimd(); typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; GridStopWatch ArithmeticTimer; int osites=Grid()->oSites(); @@ -361,17 +361,18 @@ public: double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); double usecs =-usecond(); - assert(geom.npoint==9); + // assert(geom.npoint==9); - accelerator_for(ss, Grid()->oSites(), Nsimd, { - - calcVector res = Zero(); + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); calcVector nbr; int ptype; StencilEntry *SE; int lane=SIMTlane(Nsimd); - for(int point=0;point<9;point++){ + for(int point=0;point compressor; Stencil.HaloExchange(in,compressor); - - auto point = [dir, disp](){ + + int ndim = in.Grid()->Nd(); + + ////////////// + // 4D action like wilson + // 0+ => 0 + // 0- => 1 + // 1+ => 2 + // 1- => 3 + // etc.. + ////////////// + // 5D action like DWF + // 1+ => 0 + // 1- => 1 + // 2+ => 2 + // 2- => 3 + // etc.. + + auto point = [dir, disp, ndim](){ if(dir == 0 and disp == 0) return 8; - else + else if ( ndim==4 ) { return (4 * dir + 1 - disp) / 2; + } else { + return (4 * (dir-1) + 1 - disp) / 2; + } }(); + typedef LatticeView Aview; + Vector AcceleratorViewContainer; + for(int p=0;poSites(),1,{ @@ -451,11 +479,11 @@ public: nbr = Stencil.CommBuf()[SE->_offset]; } - auto A_point = A[point].View(); - res = res + A_point[ss]*nbr; + res = res + Aview_p[point][ss]*nbr; - vstream(out_v[ss],res); + out_v[ss]=res; }); + }; void Mdiag(const CoarseVector &in, CoarseVector &out){ @@ -493,7 +521,7 @@ public: std::cout << GridLogMessage<< "CoarsenMatrix" << std::endl; // Orthogonalise the subblocks over the basis // blockOrthogonalise(InnerProd,Subspace.subspace); - std::cout << GridLogMessage<< "CoarsenMatrix orthogonalised" << std::endl; + // std::cout << GridLogMessage<< "CoarsenMatrix orthogonalised" << std::endl; // Compute the matrix elements of linop between this orthonormal // set of vectors. 
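// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] The CoarsenOperator code that
// follows builds the coarse links from the block-orthonormalised basis: for
// every stencil point p and every pair of basis vectors it computes, roughly,
//
//   A_p(x_c)_{ji} = sum_{x in block(x_c)} phi_j(x)^dag (D_p phi_i)(x)
//
// i.e. apply the directional piece of the fine operator to basis vector i and
// blockProject the result back onto the basis. The blockOrthogonalise call
// above is what makes this projection well defined. (phi_j, D_p here are
// schematic names for Subspace.subspace[j] and the point-p hopping term.)
// ---------------------------------------------------------------------------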
@@ -509,7 +537,7 @@ public: for(int i=0;i Date: Sat, 28 Dec 2019 10:32:35 -0500 Subject: [PATCH 13/43] Improved DWF multigrid --- tests/solver/Test_dwf_hdcr.cc | 290 +++++++++++++++++++++++----------- 1 file changed, 195 insertions(+), 95 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 3e603a26..817f51c7 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -50,13 +50,12 @@ public: myclass(){}; }; -myclass params; RealD InverseApproximation(RealD x){ return 1.0/x; } -template +template class MultiGridPreconditioner : public LinearFunction< Lattice > { public: @@ -76,17 +75,28 @@ public: FineOperator & _FineOperator; Matrix & _SmootherMatrix; FineOperator & _SmootherOperator; + Guesser & _Guess; + + double cheby_hi; + double cheby_lo; + int cheby_ord; + + myclass _params; // Constructor MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, FineOperator &Fine,Matrix &FineMatrix, - FineOperator &Smooth,Matrix &SmootherMatrix) + FineOperator &Smooth,Matrix &SmootherMatrix, + Guesser &Guess_, + myclass params_) : _Aggregates(Agg), _CoarseOperator(Coarse), _FineOperator(Fine), _FineMatrix(FineMatrix), _SmootherOperator(Smooth), - _SmootherMatrix(SmootherMatrix) + _SmootherMatrix(SmootherMatrix), + _Guess(Guess_), + _params(params_) { } @@ -98,7 +108,7 @@ public: MdagMLinearOperator fMdagMOp(_FineMatrix); p1=in; - for(int i=0;i<20;i++){ + for(int i=0;i<50;i++){ RealD absp1=std::sqrt(norm2(p1)); fMdagMOp.HermOp(p1,p2);// this is the G5 herm bit // _FineOperator.Op(p1,p2);// this is the G5 herm bit @@ -109,8 +119,9 @@ public: } } - void operator()(const FineField &in, FineField & out) { + void operator()(const FineField &in, FineField & out ) { operatorCheby(in,out); + //operatorADEF2(in,out); } //////////////////////////////////////////////////////////////////////// @@ -124,8 +135,8 @@ public: CoarseVector Ctmp(_CoarseOperator.Grid()); CoarseVector Csol(_CoarseOperator.Grid()); - ConjugateGradient CG(1.0e-10,100000); - ConjugateGradient fCG(1.0e-3,1000); + ConjugateGradient CG(1.0e-3,1000,false); + ConjugateGradient fCG(1.0e-3,15,false); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -152,9 +163,9 @@ public: _FineOperator.Op(Min,tmp); tmp = in - tmp; // in - A Min - Csol=Zero(); _Aggregates.ProjectToSubspace (Csrc,tmp); HermOp.AdjOp(Csrc,Ctmp);// Normal equations + Csol=Zero(); CG(MdagMOp,Ctmp,Csol); HermOp.Op(Csol,Ctmp); @@ -263,9 +274,9 @@ public: CoarseVector Csrc(_CoarseOperator.Grid()); CoarseVector Ctmp(_CoarseOperator.Grid()); - CoarseVector Csol(_CoarseOperator.Grid()); Csol=Zero(); - - ConjugateGradient CG(3.0e-3,100000); + CoarseVector Csol(_CoarseOperator.Grid()); + + ConjugateGradient CG(3.0e-2,100000); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -275,8 +286,8 @@ public: FineField vec1(in.Grid()); FineField vec2(in.Grid()); - Chebyshev Cheby (params.lo,params.hi,params.order,InverseApproximation); - Chebyshev ChebyAccu(params.lo,params.hi,params.order,InverseApproximation); + Chebyshev Cheby (_params.lo,_params.hi,_params.order,InverseApproximation); + Chebyshev ChebyAccu(_params.lo,_params.hi,_params.order,InverseApproximation); // _Aggregates.ProjectToSubspace (Csrc,in); // _Aggregates.PromoteFromSubspace(Csrc,out); @@ -313,6 +324,8 @@ public: std::cout< block ({2,2,2,2}); const int nbasis= 32; - auto clatt = GridDefaultLatt(); - std::cout << GridLogMessage << " Coarse lattice is "; for(int d=0;d 
Subspace; typedef CoarsenedMatrix CoarseOperator; typedef CoarseOperator::CoarseVector CoarseVector; - + typedef CoarseOperator::siteVector siteVector; std::cout< HermDefOp(Ddwf); + Subspace Aggregates(Coarse5d,FGrid,0); - // Aggregates.CreateSubspace(RNG5,HermDefOp,nbasis); + assert ( (nbasis & 0x1)==0); int nb=nbasis/2; std::cout< Level1Op; + typedef CoarsenedMatrix,nbasis> Level2Op; + + auto cclatt = clatt; + for(int d=0;d,nbasis> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + double c_first = 0.2; + double c_div = 1.2; + std::vector c_lo(nb); + c_lo[0] = c_first; + for(int b=1;b c_ord(nb,200); + c_ord[0]=500; + +#define RECURSIVE_MULTIGRID +#ifdef RECURSIVE_MULTIGRID std::cout< PosdefLdop(LDOp); - ConjugateGradient CG(1.0e-2,100000); - CG(PosdefLdop,c_src,c_res); + // CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nbasis,14.0,c_lo,c_ord); + // CoarseAggregates.CreateSubspaceRandom(CRNG); - /* - std::cout< HermIndefLdop(LDOp); -// ConjugateResidual MCR(1.0e-6,100000); -// MCR(HermIndefLdop,c_src,c_res); + // Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + // HermitianLinearOperator L1LinOp(LDOp); + // L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); +#endif - std::cout< Precon (Aggregates, LDOp, - HermIndefOp,Ddwf, - HermIndefOp,Ddwf); - - // MultiGridPreconditioner PreconDD(Aggregates, LDOp, - // HermIndefOp,Ddwf, - // HermIndefOpDD,DdwfDD); - // TrivialPrecon simple; - - // std::cout< fCG(1.0e-8,100000); // fCG(HermDefOp,src,result); - // std::cout< HermOpEO(Ddwf); - // ConjugateGradient pCG(1.0e-8,10000); - // pCG(HermOpEO,src_o,result_o); - - // std::cout< UPGCR(1.0e-8,100000,simple,8,128); - // UPGCR(HermIndefOp,src,result); - + std::cout< HermOpEO(Ddwf); + ConjugateGradient pCG(1.0e-8,10000); + // pCG(HermOpEO,src_o,result_o); - /// Get themax eval std::cout< IRLHermOp(LDOp); + Chebyshev IRLCheby(0.01,14,161); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + + int Nstop=32; + int Nk=32; + int Nm=48; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-4,20); + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); - // std::cout< PGCRDD(1.0e-8,100000,PreconDD,8,128); - // result=Zero(); - // std::cout< CG(3.0e-3,100000); + // CG(PosdefLdop,c_src,c_res); + + std::cout< DeflCoarseGuesser(evec,eval); + DeflCoarseGuesser(c_src,c_res); + // CG(PosdefLdop,c_src,c_res); + + std::cout< CoarseZeroGuesser; + + MultiGridPreconditioner > + Precon (Aggregates, LDOp, + HermIndefOp,Ddwf, + HermIndefOp,Ddwf, + CoarseZeroGuesser, + params); + + // Precon.PowerMethod(src); + + /* + std::cout<,nbasis,Level1Op,ZeroGuesser > + CoarsePrecon (CoarseAggregates, + L2Op, + L1LinOp,LDOp, + L1LinOp,LDOp, + CoarseZeroGuesser, + cparams); + + CoarsePrecon.PowerMethod(c_src); + */ + + /* std::cout< > + DeflatedPrecon (Aggregates, LDOp, + HermIndefOp,Ddwf, + HermIndefOp,Ddwf, + DeflCoarseGuesser, + params); + + PrecGeneralisedConjugateResidual deflPGCR(1.0e-8,100000,DeflatedPrecon,16,16); + + std::cout< CPGCR(1.0e-3,10000,CoarsePrecon,8,8); + std::cout< Date: Fri, 3 Jan 2020 05:29:09 -0500 Subject: [PATCH 14/43] Alternate low pass filter option --- Grid/algorithms/approx/Chebyshev.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 97e0e807..74789ead 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -94,6 +94,24 @@ 
public: Coeffs.assign(0.,order); Coeffs[order-1] = 1.; }; + + // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's. + // Similar kick effect below the threshold as Lanczos filter approach + void InitLowPass(RealD _lo,RealD _hi,int _order) + { + lo=_lo; + hi=_hi; + order=_order; + + if(order < 2) exit(-1); + Coeffs.resize(order); + for(int j=0;j Date: Sat, 4 Jan 2020 03:11:19 -0500 Subject: [PATCH 15/43] Nearing well optimised state --- tests/solver/Test_dwf_hdcr.cc | 77 ++++++++++++++++------------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 817f51c7..5a131a57 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -135,8 +135,8 @@ public: CoarseVector Ctmp(_CoarseOperator.Grid()); CoarseVector Csol(_CoarseOperator.Grid()); - ConjugateGradient CG(1.0e-3,1000,false); - ConjugateGradient fCG(1.0e-3,15,false); + ConjugateGradient CG(1.0e-3,100,false); + ConjugateGradient fCG(1.0e-3,10,false); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -165,7 +165,7 @@ public: _Aggregates.ProjectToSubspace (Csrc,tmp); HermOp.AdjOp(Csrc,Ctmp);// Normal equations - Csol=Zero(); + _Guess(Ctmp,Csol); CG(MdagMOp,Ctmp,Csol); HermOp.Op(Csol,Ctmp); @@ -274,9 +274,10 @@ public: CoarseVector Csrc(_CoarseOperator.Grid()); CoarseVector Ctmp(_CoarseOperator.Grid()); + CoarseVector Ctmp1(_CoarseOperator.Grid()); CoarseVector Csol(_CoarseOperator.Grid()); - ConjugateGradient CG(3.0e-2,100000); + ConjugateGradient CG(5.0e-2,100000); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -323,10 +324,20 @@ public: _Aggregates.ProjectToSubspace (Csrc,vec1); std::cout< HermIndefOpDD(DdwfDD); CoarsenedMatrix LDOp(*Coarse5d,1); // Hermitian matrix LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + exit(0); CoarseVector c_src (Coarse5d); CoarseVector c_res (Coarse5d); gaussian(CRNG,c_src); + result=Zero(); c_res=Zero(); ////////////////////////////////////////////////// @@ -515,12 +509,12 @@ int main (int argc, char ** argv) double c_first = 0.2; double c_div = 1.2; - std::vector c_lo(nb); + std::vector c_lo(nbasis/2); c_lo[0] = c_first; - for(int b=1;b c_ord(nb,200); + std::vector c_ord(nbasis/2,200); c_ord[0]=500; #define RECURSIVE_MULTIGRID @@ -562,14 +556,15 @@ int main (int argc, char ** argv) std::cout< IRLHermOp(LDOp); - Chebyshev IRLCheby(0.01,14,161); + Chebyshev IRLCheby(0.005,16.0,51); + // IRLCheby.InitLowPass(0.01,18.0,51); FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); PlainHermOp IRLOp (IRLHermOp); - int Nstop=32; - int Nk=32; + int Nstop=24; + int Nk=24; int Nm=48; - ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-4,20); + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); int Nconv; std::vector eval(Nm); std::vector evec(Nm,Coarse5d); From f7e4bd1f6d5f5aa65a8151c0faee2f845d50894d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 4 Jan 2020 03:11:53 -0500 Subject: [PATCH 16/43] Getting more optimised --- Grid/algorithms/CoarsenedMatrix.h | 214 ++++++++++++++---------------- 1 file changed, 100 insertions(+), 114 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 68c820ca..450b76df 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -63,7 +63,7 @@ public: //// report back std::cout< &hermop,int nn=nbasis) - { - // Run a Lanczos with 
sloppy convergence - const int Nstop = nn; - const int Nk = nn+20; - const int Np = nn+20; - const int Nm = Nk+Np; - const int MaxIt= 10000; - RealD resid = 1.0e-3; - - Chebyshev Cheb(0.5,64.0,21); - ImplicitlyRestartedLanczos IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt); - // IRL.lock = 1; - - FineField noise(FineGrid); gaussian(RNG,noise); - FineField tmp(FineGrid); - std::vector eval(Nm); - std::vector evec(Nm,FineGrid); - - int Nconv; - IRL.calc(eval,evec, - noise, - Nconv); - - // pull back nn vectors - for(int b=0;b "< "< "< Cheb(lo[bbb],hi,order[bbb]); bbb++; - Cheb(hermop,noise,Mn); - - // normalise - scale = std::pow(norm2(Mn),-0.5); - Mn=Mn*scale; - - // set this new vector - subspace[b] = Mn; - - // new matrix element - hermop.Op(Mn,noise); std::cout< "< Cheb(lo,hi,order); + Cheb(hermop,noise,Mn); + // normalise + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< "< @@ -511,6 +485,7 @@ public: FineField tmp(FineGrid); FineField zz(FineGrid); zz=Zero(); FineField Mphi(FineGrid); + std::vector Mphi_p(geom.npoint,FineGrid); Lattice > coor(FineGrid); @@ -518,10 +493,9 @@ public: CoarseVector oProj(Grid()); CoarseScalar InnerProd(Grid()); - std::cout << GridLogMessage<< "CoarsenMatrix" << std::endl; + // Orthogonalise the subblocks over the basis - // blockOrthogonalise(InnerProd,Subspace.subspace); - // std::cout << GridLogMessage<< "CoarsenMatrix orthogonalised" << std::endl; + blockOrthogonalise(InnerProd,Subspace.subspace); // Compute the matrix elements of linop between this orthonormal // set of vectors. @@ -536,12 +510,21 @@ public: for(int i=0;i Date: Sat, 4 Jan 2020 03:12:17 -0500 Subject: [PATCH 17/43] Make the force term and coarsening multigrid more optimised --- Grid/qcd/action/fermion/WilsonKernels.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 35715097..bf06d000 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -102,6 +102,15 @@ private: static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma); + + static accelerator void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirTm(StencilView 
&st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); // Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, From 205ea4bbb2d86f33c6fd2e2dc4e193a0d386a951 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 4 Jan 2020 03:13:40 -0500 Subject: [PATCH 18/43] More verboose Lanczos --- .../iterative/ImplicitlyRestartedLanczos.h | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index e5573c8e..8e059048 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -43,6 +43,11 @@ NAMESPACE_BEGIN(Grid); template void basisOrthogonalize(std::vector &basis,Field &w,int k) { + // If assume basis[j] are already orthonormal, + // can take all inner products in parallel saving 2x bandwidth + // Save 3x bandwidth on the second line of loop. + // perhaps 2.5x speed up. + // 2x overall in Multigrid Lanczos for(int j=0; j& lmd, @@ -589,6 +594,7 @@ until convergence std::vector& evec, Field& w,int Nm,int k) { + std::cout<0) w -= lme[k-1] * evec[k-1]; - ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) + ComplexD zalph = innerProduct(evec_k,w); RealD alph = real(zalph); - w = w - alph * evec_k;// 5. wk:=wk−αkvk + w = w - alph * evec_k; - RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop - // 7. vk+1 := wk/βk+1 + RealD beta = normalise(w); lmd[k] = alph; lme[k] = beta; - if (k>0 && k % orth_period == 0) { + if ( (k>0) && ( (k % orth_period) == 0 )) { + std::cout<& lmd, std::vector& lme, From 3c3d6a94f3ad1df6a38fb3549f016f8c7dc979e6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 4 Jan 2020 03:16:23 -0500 Subject: [PATCH 19/43] OPtimising the force term a bit --- .../WilsonKernelsImplementation.h | 84 +++++++++++++------ 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index a787fa79..f13bfdde 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); -#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ - if (gamma == Dir) { \ +#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ if (SE->_is_local ) { \ int perm= SE->_permute; \ auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ @@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ - Recon(result, Uchi); \ - synchronise(); \ + Recon(result, Uchi); + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \ } + //////////////////////////////////////////////////////////////////// // All legs kernels ; comms then compute //////////////////////////////////////////////////////////////////// @@ -284,7 +287,36 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField } }; -template +#define DhopDirMacro(Dir,spProj,spRecon) \ + template \ + void WilsonKernels::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView 
&U,SiteHalfSpinor *buf, int sF, \ + int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ + { \ + typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; \ + typedef decltype(coalescedRead(in[0])) calcSpinor; \ + calcHalfSpinor chi; \ + calcSpinor result; \ + calcHalfSpinor Uchi; \ + StencilEntry *SE; \ + int ptype; \ + const int Nsimd = SiteHalfSpinor::Nsimd(); \ + const int lane=SIMTlane(Nsimd); \ + \ + SE = st.GetEntry(ptype, dir, sF); \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ + coalescedWrite(out[sF], result,lane); \ + } + +DhopDirMacro(Xp,spProjXp,spReconXp); +DhopDirMacro(Yp,spProjYp,spReconYp); +DhopDirMacro(Zp,spProjZp,spReconZp); +DhopDirMacro(Tp,spProjTp,spReconTp); +DhopDirMacro(Xm,spProjXm,spReconXm); +DhopDirMacro(Ym,spProjYm,spReconYm); +DhopDirMacro(Zm,spProjZm,spReconZm); +DhopDirMacro(Tm,spProjTm,spReconTm); + +template void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { @@ -299,18 +331,7 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si const int lane=SIMTlane(Nsimd); SE = st.GetEntry(ptype, dir, sF); - if (gamma == Xp) { - if (SE->_is_local ) { - int perm= SE->_permute; - auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); - spProjXp(chi,tmp); - } else { - chi = coalescedRead(buf[SE->_offset],lane); - } - Impl::multLink(Uchi, U[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); @@ -332,13 +353,28 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S auto in_v = in.View(); auto out_v = out.View(); auto st_v = st.View(); - accelerator_for(ss,Nsite,Simd::Nsimd(),{ - for(int s=0;s Date: Mon, 6 Jan 2020 11:43:59 -0500 Subject: [PATCH 20/43] Change to interface to minise comms in evaluating coarse space operator --- Grid/algorithms/CoarsenedMatrix.h | 129 ++++++++++-------- Grid/algorithms/LinearOperator.h | 16 +++ Grid/algorithms/SparseMatrix.h | 13 +- Grid/qcd/action/fermion/CayleyFermion5D.h | 3 +- .../fermion/ContinuedFractionFermion5D.h | 15 +- Grid/qcd/action/fermion/FermionOperator.h | 1 + .../action/fermion/ImprovedStaggeredFermion.h | 1 + .../fermion/ImprovedStaggeredFermion5D.h | 3 +- .../action/fermion/PartialFractionFermion5D.h | 11 +- Grid/qcd/action/fermion/WilsonFermion.h | 5 +- Grid/qcd/action/fermion/WilsonFermion5D.h | 10 +- Grid/qcd/action/fermion/g5HermitianLinop.h | 14 ++ .../CayleyFermion5DImplementation.h | 8 ++ ...ContinuedFractionFermion5DImplementation.h | 19 +++ ...ImprovedStaggeredFermion5DImplementation.h | 8 +- .../ImprovedStaggeredFermionImplementation.h | 12 +- .../PartialFractionFermion5DImplementation.h | 21 ++- .../WilsonFermion5DImplementation.h | 24 ++++ .../WilsonFermionImplementation.h | 40 ++++-- Grid/qcd/utils/CovariantLaplacian.h | 1 + 20 files changed, 262 insertions(+), 92 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 450b76df..ba71faf8 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -35,14 +35,13 @@ Author: paboyle NAMESPACE_BEGIN(Grid); class Geometry { - // int dimension; public: int npoint; std::vector directions ; std::vector displacements; Geometry(int _d) { - + int base = (_d==5) ? 
1:0; // make coarse grid stencil for 4d , not 5d @@ -187,9 +186,10 @@ public: } } - // - // World of possibilities here. - // + //////////////////////////////////////////////////////////////////////////////////////////////// + // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) + // and this is the best I found + //////////////////////////////////////////////////////////////////////////////////////////////// virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, int nn, double hi, @@ -249,11 +249,18 @@ public: hermop.HermOp(*Tn,y); - y=xscale*y+mscale*(*Tn); - - *Tnp=2.0*y-(*Tnm); + auto y_v = y.View(); + auto Tn_v = Tn->View(); + auto Tnp_v = Tnp->View(); + auto Tnm_v = Tnm->View(); + accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { + coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); + coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); + }); - if ( (n%orderstep)==0 ) { + // Possible more fine grained control is needed than a linear sweep, + // but huge productivity gain if this is simple algorithm and not a tunable + if ( (n%orderstep)==0 ) { Mn=*Tnp; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; subspace[b] = Mn; @@ -270,6 +277,7 @@ public: } } + assert(b==nn); } }; @@ -393,42 +401,15 @@ public: return norm2(out); } }; - - void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){ - - conformable(_grid,in.Grid()); - conformable(in.Grid(),out.Grid()); - + void MdirComms(const CoarseVector &in) + { SimpleCompressor compressor; - Stencil.HaloExchange(in,compressor); - - int ndim = in.Grid()->Nd(); - - ////////////// - // 4D action like wilson - // 0+ => 0 - // 0- => 1 - // 1+ => 2 - // 1- => 3 - // etc.. - ////////////// - // 5D action like DWF - // 1+ => 0 - // 1- => 1 - // 2+ => 2 - // 2- => 3 - // etc.. - - auto point = [dir, disp, ndim](){ - if(dir == 0 and disp == 0) - return 8; - else if ( ndim==4 ) { - return (4 * dir + 1 - disp) / 2; - } else { - return (4 * (dir-1) + 1 - disp) / 2; - } - }(); + } + void MdirCalc(const CoarseVector &in, CoarseVector &out, int point) + { + conformable(_grid,in.Grid()); + conformable(_grid,out.Grid()); typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -458,10 +439,54 @@ public: out_v[ss]=res; }); + } + void MdirAll(const CoarseVector &in,std::vector &out) + { + this->MdirComms(in); + int ndir=geom.npoint-1; + assert(out.size()==ndir); + for(int p=0;pMdirComms(in); + + int ndim = in.Grid()->Nd(); + + ////////////// + // 4D action like wilson + // 0+ => 0 + // 0- => 1 + // 1+ => 2 + // 1- => 3 + // etc.. + ////////////// + // 5D action like DWF + // 1+ => 0 + // 1- => 1 + // 2+ => 2 + // 2- => 3 + // etc.. 
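// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] Worked examples of the point
// indexing implemented by the lambda just below, using the convention listed
// above:
//   4D: dir=0,disp=+1 -> (4*0+1-1)/2 = 0 ;  dir=0,disp=-1 -> (4*0+1+1)/2 = 1 ;
//       dir=1,disp=+1 -> (4*1+1-1)/2 = 2
//   5D: dir=1,disp=+1 -> (4*(1-1)+1-1)/2 = 0 ;  dir=1,disp=-1 -> 1
//   dir=0,disp=0 is the self-coupling and maps to point 8.
// ---------------------------------------------------------------------------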
+ auto point = [dir, disp, ndim](){ + if(dir == 0 and disp == 0) + return 8; + else if ( ndim==4 ) { + return (4 * dir + 1 - disp) / 2; + } else { + return (4 * (dir-1) + 1 - disp) / 2; + } + }(); + + MdirCalc(in,out,point); + }; - void Mdiag(const CoarseVector &in, CoarseVector &out){ - Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil + void Mdiag(const CoarseVector &in, CoarseVector &out) + { + int point=geom.npoint-1; + MdirCalc(in, out, point); // No comms }; @@ -511,16 +536,12 @@ public: for(int i=0;i &out) = 0; // Abstract base virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base @@ -83,6 +84,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -116,6 +120,9 @@ public: _Mat.Mdir(in,out,dir,disp); assert(0); } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); assert(0); @@ -154,6 +161,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -183,6 +193,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -234,6 +247,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; }; template class SchurDiagMooeeOperator : public SchurOperatorBase { diff --git a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index ffed7527..fd713e9f 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -47,6 +47,7 @@ public: } virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; + virtual void MdirAll (const Field &in, std::vector &out)=0; }; ///////////////////////////////////////////////////////////////////////////////////////////// @@ -56,12 +57,12 @@ template class CheckerBoardedSparseMatrixBase : public SparseMatrix public: virtual GridBase *RedBlackGrid(void)=0; - ////////////////////////////////////////////////////////////////////// - // Query the even even properties to make algorithmic decisions - ////////////////////////////////////////////////////////////////////// - virtual RealD Mass(void) { return 0.0; }; - virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden - virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better + ////////////////////////////////////////////////////////////////////// + // Query the even even properties to make algorithmic decisions + ////////////////////////////////////////////////////////////////////// + virtual RealD Mass(void) { return 0.0; }; + virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden + virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better // half checkerboard operaions virtual void Meooe (const Field &in, Field &out)=0; diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h 
b/Grid/qcd/action/fermion/CayleyFermion5D.h index 333ba49b..c2ccb98b 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -101,7 +101,8 @@ public: virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); // Efficient support for multigrid coarsening - virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); void Meooe5D (const FermionField &in, FermionField &out); void MeooeDag5D (const FermionField &in, FermionField &out); diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 379c5f8f..5aa7bfbd 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -62,14 +62,15 @@ public: // Efficient support for multigrid coarsening virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); - /////////////////////////////////////////////////////////////// - // Physical surface field utilities - /////////////////////////////////////////////////////////////// - // virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case - // virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case - virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); - virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); + /////////////////////////////////////////////////////////////// + // Physical surface field utilities + /////////////////////////////////////////////////////////////// + // virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case + // virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case + virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); + virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); // Constructors ContinuedFractionFermion5D(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h index c60a2e84..cbc6ca63 100644 --- a/Grid/qcd/action/fermion/FermionOperator.h +++ b/Grid/qcd/action/fermion/FermionOperator.h @@ -89,6 +89,7 @@ public: virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + virtual void MdirAll(const FermionField &in, std::vector &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { assert(0);}; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index b4d8d60b..5cb95ca6 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -103,6 +103,7 @@ public: // Multigrid assistance; force term uses too /////////////////////////////////////////////////////////////// void Mdir(const 
FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index b10c0356..8e3d4be5 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -86,7 +86,8 @@ public: void MooeeDag (const FermionField &in, FermionField &out); void MooeeInvDag (const FermionField &in, FermionField &out); - void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); // These can be overridden by fancy 5d chiral action diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index d61515f0..928abd3f 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -67,12 +67,13 @@ public: // Efficient support for multigrid coarsening virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); - /////////////////////////////////////////////////////////////// - // Physical surface field utilities - /////////////////////////////////////////////////////////////// - virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); - virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); + /////////////////////////////////////////////////////////////// + // Physical surface field utilities + /////////////////////////////////////////////////////////////// + virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); + virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); // Constructors PartialFractionFermion5D(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 3a712435..a3f5d2d7 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -115,9 +115,10 @@ public: // Multigrid assistance; force term uses too /////////////////////////////////////////////////////////////// void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); - void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp, - int gamma, int dag); + void DhopDirAll(const FermionField &in, std::vector &out); + void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag); /////////////////////////////////////////////////////////////// // Extra methods added by derived diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 8f1073db..58b54421 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -111,15 +111,16 @@ public: virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; virtual 
void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + virtual void MdirAll(const FermionField &in, std::vector &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac // These can be overridden by fancy 5d chiral action virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; - void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; - void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; // Implement hopping term non-hermitian hopping term; half cb or both // Implement s-diagonal DW @@ -131,6 +132,9 @@ public: // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); + void DhopDirAll(const FermionField &in,std::vector &out); + void DhopDirComms(const FermionField &in); + void DhopDirCalc(const FermionField &in, FermionField &out,int point); /////////////////////////////////////////////////////////////// // New methods added diff --git a/Grid/qcd/action/fermion/g5HermitianLinop.h b/Grid/qcd/action/fermion/g5HermitianLinop.h index 2e417ceb..90bb6d59 100644 --- a/Grid/qcd/action/fermion/g5HermitianLinop.h +++ b/Grid/qcd/action/fermion/g5HermitianLinop.h @@ -54,6 +54,14 @@ public: _Mat.Mdir(in,tmp,dir,disp); G5R5(out,tmp); } + void OpDirAll(const Field &in, std::vector &out) { + Field tmp(in.Grid()); + _Mat.MdirAll(in,out); + for(int p=0;p &out) { + _Mat.MdirAll(in,out); + for(int p=0;p::Mdir (const FermionField &psi, FermionField &chi,in Meo5D(psi,tmp); this->DhopDir(tmp,chi,dir,disp); } +template +void CayleyFermion5D::MdirAll(const FermionField &psi, std::vector &out) +{ + FermionField tmp(psi.Grid()); + Meo5D(psi,tmp); + this->DhopDirAll(tmp,out); +} + // force terms; five routines; default to Dhop on diagonal template void CayleyFermion5D::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag) diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h index 7af02263..beeb3e00 100644 --- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h @@ -143,6 +143,25 @@ void ContinuedFractionFermion5D::Mdir (const FermionField &psi, FermionFi } } template +void ContinuedFractionFermion5D::MdirAll (const FermionField &psi, std::vector &chi) +{ + int Ls = this->Ls; + + this->DhopDirAll(psi,chi); // Dslash on diagonal. 
g5 Dslash is hermitian + + for(int p=0;p void ContinuedFractionFermion5D::Meooe (const FermionField &psi, FermionField &chi) { int Ls = this->Ls; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index c42e896f..fdaa2f71 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D::ZeroCounters(void) // Implement the general interface. Here we use SAME mass on all slices ///////////////////////////////////////////////////////////////////////// template -void ImprovedStaggeredFermion5D::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion5D::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } template +void ImprovedStaggeredFermion5D::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); +} +template RealD ImprovedStaggeredFermion5D::M(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index e2605d81..b4359879 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -362,12 +362,19 @@ void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField } template -void ImprovedStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } +template +void ImprovedStaggeredFermion::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); // Not implemented yet +} template -void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +{ Compressor compressor; Stencil.HaloExchange(in, compressor); @@ -380,6 +387,7 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel }); }; + template void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h index 9f8f91ad..edc674cc 100644 --- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h @@ -31,7 +31,7 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -template + template void PartialFractionFermion5D::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ // this does both dag and undag but is trivial; make a common helper routing int Ls = this->Ls; @@ -45,8 +45,25 @@ void PartialFractionFermion5D::Mdir (const FermionField &psi, FermionFiel ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); } ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1); - } +template +void PartialFractionFermion5D::MdirAll (const FermionField &psi, 
std::vector &chi){ + // this does both dag and undag but is trivial; make a common helper routing + int Ls = this->Ls; + + this->DhopDirAll(psi,chi); + + for(int point=0;point void PartialFractionFermion5D::Meooe_internal(const FermionField &psi, FermionField &chi,int dag) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 1bdc9a64..a2092960 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -241,6 +241,30 @@ void WilsonFermion5D::DhopDir(const FermionField &in, FermionField &out,in Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma); }; +template +void WilsonFermion5D::DhopDirAll(const FermionField &in, std::vector &out) +{ + Compressor compressor(DaggerNo); + Stencil.HaloExchange(in,compressor); + + assert( (out.size()==8)||(out.size()==9)); + for(int dir5=1;dir5<=4;dir5++) { + for(int disp=-1;disp<=1;disp+=2){ + int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil + // we drop off the innermost fifth dimension + assert( (disp==1)||(disp==-1) ); + assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t; + + int skip = (disp==1) ? 0 : 1; + int dirdisp = dir+skip*4; + int gamma = dir+(1-skip)*4; + + uint64_t Nsite = Umu.Grid()->oSites(); + Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out[dirdisp],dirdisp,gamma); + } + } +}; + template void WilsonFermion5D::DerivInternal(StencilImpl & st, diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 756bdbf4..76b904e9 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -319,28 +319,51 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d } template -void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } +template +void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) +{ + DhopDirAll(in, out); +} template void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { + Compressor compressor(DaggerNo); + Stencil.HaloExchange(in, compressor); + int skip = (disp == 1) ? 
0 : 1; int dirdisp = dir + skip * 4; int gamma = dir + (1 - skip) * 4; - DhopDirDisp(in, out, dirdisp, gamma, DaggerNo); + DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); }; - template -void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) { - Compressor compressor(dag); - + Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); + + assert((out.size()==8)||(out.size()==9)); + for(int dir=0;dir +void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +{ int Ls=1; - int Nsite=in.oSites(); + uint64_t Nsite=in.oSites(); Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma); }; @@ -348,7 +371,8 @@ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); diff --git a/Grid/qcd/utils/CovariantLaplacian.h b/Grid/qcd/utils/CovariantLaplacian.h index 0e0620a7..94322507 100644 --- a/Grid/qcd/utils/CovariantLaplacian.h +++ b/Grid/qcd/utils/CovariantLaplacian.h @@ -92,6 +92,7 @@ public: }; void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);} + void MdirAll(const GaugeField&, std::vector &){ assert(0);} void Mdiag(const GaugeField&, GaugeField&){ assert(0);} void ImportGauge(const GaugeField& _U) { From 03da4040e21d41388b61d6fd9ddd6531ef6b5180 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 6 Jan 2020 11:47:48 -0500 Subject: [PATCH 21/43] Make summit happy --- configure.ac | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index f4a5e503..9a16989e 100644 --- a/configure.ac +++ b/configure.ac @@ -280,7 +280,8 @@ case ${CXX} in # CXXLD="nvcc -v -link" CXX="nvcc -x cu " CXXLD="nvcc -link" - CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" +# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" + CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" if test $ac_openmp = yes; then CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" fi From 55cdb17691fb2adefbd63c0b8cf2934dfca7927a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:27:45 -0500 Subject: [PATCH 22/43] Integer divide for blocking --- Grid/simd/Grid_gpu_vec.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 471fbccc..4584fb36 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -403,6 +403,10 @@ namespace Optimization { accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){ return a/b; } + accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){ + return a/b; + } + // Danger -- element wise divide fro complex, not complex div. 
// See Grid_vector_types.h lines around 735, applied after "toReal" accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){ From 48008e4d8b3958191f754240791bb229daba7b8f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:28:16 -0500 Subject: [PATCH 23/43] Thread coordinate creation loop --- Grid/lattice/Lattice_coordinate.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h index 16f3641b..a1abe58d 100644 --- a/Grid/lattice/Lattice_coordinate.h +++ b/Grid/lattice/Lattice_coordinate.h @@ -37,19 +37,18 @@ template inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - Coordinate gcoor; - ExtractBuffer mergebuf(Nsimd); - - vector_type vI; auto l_v = l.View(); - for(int o=0;ooSites();o++){ + thread_for( o, grid->oSites(), { + vector_type vI; + Coordinate gcoor; + ExtractBuffer mergebuf(Nsimd); for(int i=0;iiSites();i++){ grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); mergebuf[i]=(Integer)gcoor[mu]; } merge(vI,mergebuf); l_v[o]=vI; - } + }); }; // LatticeCoordinate(); From fa856c9669eda8793a58769d47c366bb2ce3a7ec Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:28:46 -0500 Subject: [PATCH 24/43] Disable information message --- Grid/threads/Pragmas.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h index 4d713258..d05f8ee9 100644 --- a/Grid/threads/Pragmas.h +++ b/Grid/threads/Pragmas.h @@ -43,7 +43,6 @@ Author: paboyle #ifdef _OPENMP #define GRID_OMP #include -#warning "Grid is using OpenMP for host loops" #endif #ifdef GRID_OMP From 1bd87c35d7af979ae7181aced485658a432a3adb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:29:56 -0500 Subject: [PATCH 25/43] Read coalescing on Nvidia --- Grid/lattice/Lattice_transfer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 0041f47a..abe42733 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -196,7 +196,7 @@ inline void blockZAXPY(Lattice &fineZ, auto fineY_ = fineY.View(); auto coarseA_= coarseA.View(); - accelerator_for(sf, fine->oSites(), 1, { + accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { int sc; Coordinate coor_c(_ndimension); @@ -207,7 +207,7 @@ inline void blockZAXPY(Lattice &fineZ, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); // z = A x + y - fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf]; + coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); }); @@ -397,8 +397,8 @@ inline void blockPromote(const Lattice > &coarseData, Lattice cip(coarse); auto cip_ = cip.View(); auto ip_ = ip.View(); - accelerator_for(sc,coarse->oSites(),1,{ - cip_[sc] = ip_[sc](); + accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ + coalescedWrite(cip_[sc], ip_(sc)()); }); blockZAXPY(fineData,cip,Basis[i],fineData); } From d8b97420924b0b997b5674610af4b86f8524cc3f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:34:54 -0500 Subject: [PATCH 26/43] DhopDirAll for faster matrix elements of little Dirac operator --- Grid/qcd/action/fermion/WilsonKernels.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index bf06d000..7348a463 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ 
b/Grid/qcd/action/fermion/WilsonKernels.h @@ -60,6 +60,9 @@ public: int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; + static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, + int Nsite, const FermionField &in, std::vector &out) ; + static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma); @@ -100,17 +103,17 @@ public: private: - static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, + static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma); - static accelerator void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirTm(StencilView 
&st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); // Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, From 8016a465aeed007baa936ff8ed97acfbb8746dab Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:35:37 -0500 Subject: [PATCH 27/43] Remove extraneous variable --- .../implementation/ImprovedStaggeredFermionImplementation.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index b4359879..0b723c47 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -412,7 +412,6 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st #ifdef GRID_OMP Compressor compressor; int len = U.Grid()->oSites(); - const int LLs = 1; DhopTotalTime -= usecond(); From e5d1c0966577a037456ead69515b758bbe0ea776 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:38:54 -0500 Subject: [PATCH 28/43] Faster DhopDirAll for little dirac operator coarsening --- .../WilsonKernelsImplementation.h | 42 +++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index f13bfdde..9e032b04 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -342,6 +342,38 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si coalescedWrite(out[sF], result,lane); } +template +void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, + int Nsite, const FermionField &in, std::vector &out) +{ + auto U_v = U.View(); + auto in_v = in.View(); + auto st_v = st.View(); + + auto out_Xm = out[0].View(); + auto out_Ym = out[1].View(); + auto out_Zm = out[2].View(); + auto out_Tm = out[3].View(); + auto out_Xp = out[4].View(); + auto out_Yp = out[5].View(); + auto out_Zp = out[6].View(); + auto out_Tp = out[7].View(); + + accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ + int sU=sss/Ls; + int sF =sss; + DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0); + DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1); + DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2); + DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3); + DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4); + DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5); + DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6); + DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7); + }); +} + + template void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) @@ -354,7 +386,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S auto out_v = out.View(); auto st_v = st.View(); #define LoopBody(Dir) \ - if (gamma==Dir) { \ + case Dir : \ accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \ for(int s=0;s::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\ } \ }); \ - } + break; + 
switch(gamma){ LoopBody(Xp); LoopBody(Yp); LoopBody(Zp); @@ -373,7 +406,10 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S LoopBody(Ym); LoopBody(Zm); LoopBody(Tm); - + default: + assert(0); + break; + } #undef LoopBody } From 7c061e20c95a01868546deece1989bb84d147a04 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:40:13 -0500 Subject: [PATCH 29/43] All directions of dirac operator for fastt coarsening --- .../WilsonFermion5DImplementation.h | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index a2092960..613eaa7b 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -246,23 +246,8 @@ void WilsonFermion5D::DhopDirAll(const FermionField &in, std::vector=0)&&(dir<4) ); //must do x,y,z or t; - - int skip = (disp==1) ? 0 : 1; - int dirdisp = dir+skip*4; - int gamma = dir+(1-skip)*4; - - uint64_t Nsite = Umu.Grid()->oSites(); - Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out[dirdisp],dirdisp,gamma); - } - } + uint64_t Nsite = Umu.Grid()->oSites(); + Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out); }; From afc7426f390687778a673d7a3073b224c2f388a7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:41:16 -0500 Subject: [PATCH 30/43] Much bigger pointer cache in case of Nvidia due to cost of setting up UVM allocations --- Grid/allocator/AlignedAllocator.cc | 10 ++++++++-- Grid/allocator/AlignedAllocator.h | 11 ++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index ed27a8bd..d53c4dc2 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,6 +6,12 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; +#ifdef GRID_NVCC +#define SMALL_LIMIT (0) +#else +#define SMALL_LIMIT (4096) +#endif + #ifdef POINTER_CACHE int PointerCache::victim; @@ -13,7 +19,7 @@ PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; void *PointerCache::Insert(void *ptr,size_t bytes) { - if (bytes < 4096 ) return ptr; + if (bytes < SMALL_LIMIT ) return ptr; #ifdef GRID_OMP assert(omp_in_parallel()==0); @@ -50,7 +56,7 @@ void *PointerCache::Insert(void *ptr,size_t bytes) { void *PointerCache::Lookup(size_t bytes) { - if (bytes < 4096 ) return NULL; + if (bytes < SMALL_LIMIT ) return NULL; #ifdef GRID_OMP assert(omp_in_parallel()==0); diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 2aa7d82b..8c189be8 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -49,8 +49,13 @@ NAMESPACE_BEGIN(Grid); #ifdef POINTER_CACHE class PointerCache { private: - +/*Pinning pages is costly*/ +/*Could maintain separate large and small allocation caches*/ +#ifdef GRID_NVCC + static const int Ncache=128; +#else static const int Ncache=8; +#endif static int victim; typedef struct { @@ -63,7 +68,6 @@ private: public: - static void *Insert(void *ptr,size_t bytes) ; static void *Lookup(size_t bytes) ; @@ -170,13 +174,14 @@ public: // Unified (managed) memory //////////////////////////////////// if ( ptr == (_Tp *) NULL ) { + // printf(" alignedAllocater cache miss %ld bytes 
",bytes); BACKTRACEFP(stdout); auto err = cudaMallocManaged((void **)&ptr,bytes); if( err != cudaSuccess ) { ptr = (_Tp *) NULL; std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " < Date: Mon, 27 Jan 2020 12:41:59 -0500 Subject: [PATCH 31/43] Less sloppy convergence test on PowerMethod --- Grid/algorithms/iterative/PowerMethod.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/algorithms/iterative/PowerMethod.h b/Grid/algorithms/iterative/PowerMethod.h index 8a238ea6..6aa8e923 100644 --- a/Grid/algorithms/iterative/PowerMethod.h +++ b/Grid/algorithms/iterative/PowerMethod.h @@ -30,12 +30,12 @@ template class PowerMethod RealD vden = norm2(src_n); RealD na = vnum/vden; - if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { + if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { evalMaxApprox = na; + std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; return evalMaxApprox; } evalMaxApprox = na; - std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; src_n = tmp; } assert(0); From b2736ec80b92d99abc5a7ff7539156021a19e50f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:42:48 -0500 Subject: [PATCH 32/43] Make PrecGCR recursive - it can precondition itself --- .../PrecGeneralisedConjugateResidual.h | 70 +++++++------------ 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h index d3188ecb..a61b62e0 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h @@ -38,10 +38,11 @@ Author: Peter Boyle /////////////////////////////////////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); +#define GCRLogLevel std::cout << GridLogMessage < -class PrecGeneralisedConjugateResidual : public OperatorFunction { +class PrecGeneralisedConjugateResidual : public LinearFunction { public: - using OperatorFunction::operator(); RealD Tolerance; Integer MaxIterations; @@ -49,23 +50,29 @@ public: int mmax; int nstep; int steps; + int level; GridStopWatch PrecTimer; GridStopWatch MatTimer; GridStopWatch LinalgTimer; - LinearFunction &Preconditioner; + LinearFunction &Preconditioner; + LinearOperatorBase &Linop; - PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction &Prec,int _mmax,int _nstep) : + void Level(int lv) { level=lv; }; + + PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase &_Linop,LinearFunction &Prec,int _mmax,int _nstep) : Tolerance(tol), MaxIterations(maxit), + Linop(_Linop), Preconditioner(Prec), mmax(_mmax), nstep(_nstep) { + level=1; verbose=1; }; - void operator() (LinearOperatorBase &Linop,const Field &src, Field &psi){ + void operator() (const Field &src, Field &psi){ psi=Zero(); RealD cp, ssq,rsq; @@ -84,9 +91,9 @@ public: steps=0; for(int k=0;k &Linop,const Field &src, Field &psi,RealD rsq){ + RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD cp; RealD a, b; @@ -134,9 +143,7 @@ public: std::vector p(mmax,grid); std::vector qq(mmax); - std::cout< Date: Mon, 27 Jan 2020 12:43:29 -0500 Subject: [PATCH 33/43] Normal Equations can be used in HDCR now --- Grid/algorithms/iterative/NormalEquations.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git 
a/Grid/algorithms/iterative/NormalEquations.h b/Grid/algorithms/iterative/NormalEquations.h index 270b0115..df82b6dc 100644 --- a/Grid/algorithms/iterative/NormalEquations.h +++ b/Grid/algorithms/iterative/NormalEquations.h @@ -33,26 +33,30 @@ NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////////////////////////////////////////// // Take a matrix and form an NE solver calling a Herm solver /////////////////////////////////////////////////////////////////////////////////////////////////////// -template class NormalEquations : public OperatorFunction{ +template class NormalEquations { private: SparseMatrixBase & _Matrix; OperatorFunction & _HermitianSolver; - + LinearFunction & _Guess; public: ///////////////////////////////////////////////////// // Wrap the usual normal equations trick ///////////////////////////////////////////////////// - NormalEquations(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver) - : _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; + NormalEquations(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver, + LinearFunction &Guess) + : _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; void operator() (const Field &in, Field &out){ Field src(in.Grid()); + Field tmp(in.Grid()); + MdagMLinearOperator,Field> MdagMOp(_Matrix); _Matrix.Mdag(in,src); - _HermitianSolver(src,out); // Mdag M out = Mdag in - + _Guess(src,out); + _HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in + } }; From 8cec294ec99a5a63436849ce1de0f6aebc35bf25 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:44:04 -0500 Subject: [PATCH 34/43] Make CG a bit less verbose as gettign annoying in nested algorithms. Can use Iterative logging if you want to see more --- Grid/algorithms/iterative/ConjugateGradient.h | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 398f578f..32ba4035 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -71,7 +71,6 @@ public: // Initial residual computation & set up RealD guess = norm2(psi); assert(std::isnan(guess) == 0); - Linop.HermOpAndNorm(psi, mmp, d, b); @@ -154,18 +153,18 @@ public: RealD resnorm = std::sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; - std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)< Date: Mon, 27 Jan 2020 12:44:51 -0500 Subject: [PATCH 35/43] Use explicit linalg calls to get coalesce optimisations on GPU --- Grid/algorithms/approx/Chebyshev.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 74789ead..133db2b4 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -252,20 +252,20 @@ public: RealD xscale = 2.0/(hi-lo); RealD mscale = -(hi+lo)/(hi-lo); Linop.HermOp(T0,y); - T1=y*xscale+in*mscale; + axpby(T1,xscale,mscale,y,in); // sum = .5 c[0] T0 + c[1] T1 - out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1; + // out = ()*T0 + Coeffs[1]*T1; + axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1); for(int n=2;n Date: Mon, 27 Jan 2020 13:42:51 -0500 Subject: [PATCH 36/43] Optional MdagM without norms --- Grid/algorithms/SparseMatrix.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index fd713e9f..b959f53c 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -45,6 +45,10 @@ public: ni=M(in,tmp); no=Mdag(tmp,out); } + virtual void MdagM(const Field &in, Field &out) { + RealD ni, no; + MdagM(in,out,ni,no); + } virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void MdirAll (const Field &in, std::vector &out)=0; From 76c823781e6b54b88657b695a8f77e2e13db0026 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 13:43:19 -0500 Subject: [PATCH 37/43] Much faster coarsening --- Grid/algorithms/CoarsenedMatrix.h | 496 ++++++++++++++++++++++++------ 1 file changed, 409 insertions(+), 87 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index ba71faf8..01b0da4d 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -1,3 +1,14 @@ + // blockZaxpy in bockPromote - 3s, 5% + // noncoalesced linalg in Preconditionoer ~ 3s 5% + // Lancos tuning or replace 10-20s ~ 25%, open ended + // setup tuning 5s ~ 8% + // -- e.g. ordermin, orderstep tunables. + // MdagM path without norm in LinOp code. few seconds + + // Mdir calc blocking kernels + // Fuse kernels in blockMaskedInnerProduct + // preallocate Vectors in Cayley 5D ~ few percent few seconds + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -34,6 +45,34 @@ Author: paboyle NAMESPACE_BEGIN(Grid); +template +inline void blockMaskedInnerProduct(Lattice &CoarseInner1, + Lattice &CoarseInner2, + const Lattice &FineMask1, + const Lattice &FineMask2, + const Lattice &fineX, + const Lattice &fineY) +{ + typedef decltype(innerProduct(vobj(),vobj())) dotp; + + GridBase *coarse(CoarseInner1.Grid()); + GridBase *fine (fineX.Grid()); + + Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); + Lattice fine_inner_msk(fine); + + // Multiply could be fused with innerProduct + // Single block sum kernel could do both masks. 
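  // A minimal sketch of the fusion the two comments above suggest, assuming a
  // hypothetical helper blockMaskedSum2 that is NOT part of Grid: a single
  // kernel could read localInnerProduct(fineX,fineY) once per fine site and
  // accumulate it into both coarse sums, weighted by the two masks,
  //
  //   blockMaskedSum2(CoarseInner1,CoarseInner2, FineMask1,FineMask2, fineX,fineY);
  //
  // The patch keeps the multiply and the two blockSum calls separate: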
+ fine_inner = localInnerProduct(fineX,fineY); + + mult(fine_inner_msk, fine_inner,FineMask1); + blockSum(CoarseInner1,fine_inner_msk); + + mult(fine_inner_msk, fine_inner,FineMask2); + blockSum(CoarseInner2,fine_inner_msk); +} + + class Geometry { public: int npoint; @@ -51,10 +90,10 @@ public: directions.resize(npoint); displacements.resize(npoint); for(int d=0;d<_d;d++){ - directions[2*d ] = d+base; - directions[2*d+1] = d+base; - displacements[2*d ] = +1; - displacements[2*d+1] = -1; + directions[d ] = d+base; + directions[d+_d] = d+base; + displacements[d ] = +1; + displacements[d+_d]= -1; } directions [2*_d]=0; displacements[2*_d]=0; @@ -136,20 +175,15 @@ public: std::cout< &hermop, int nn, double hi, double lo, - int order, - int orderstep + int orderfilter, + int ordermin, + int orderstep, + double filterlo ) { RealD scale; @@ -215,7 +252,7 @@ public: int b =0; { // Filter - Chebyshev Cheb(lo,hi,order); + Chebyshev Cheb(lo,hi,orderfilter); Cheb(hermop,noise,Mn); // normalise scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; @@ -227,7 +264,7 @@ public: // Generate a full sequence of Chebyshevs { - lo=0; + lo=filterlo; noise=Mn; FineField T0(FineGrid); T0 = noise; @@ -245,7 +282,7 @@ public: hermop.HermOp(T0,y); T1=y*xscale+noise*mscale; - for(int n=2;n<=orderstep*(nn-1);n++){ + for(int n=2;n<=ordermin+orderstep*(nn-2);n++){ hermop.HermOp(*Tn,y); @@ -253,6 +290,7 @@ public: auto Tn_v = Tn->View(); auto Tnp_v = Tnp->View(); auto Tnm_v = Tnm->View(); + const int Nsimd = CComplex::Nsimd(); accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); @@ -260,12 +298,14 @@ public: // Possible more fine grained control is needed than a linear sweep, // but huge productivity gain if this is simple algorithm and not a tunable - if ( (n%orderstep)==0 ) { + int m =1; + if ( n>=ordermin ) m=n-ordermin; + if ( (m%orderstep)==0 ) { Mn=*Tnp; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; subspace[b] = Mn; hermop.Op(Mn,tmp); - std::cout< "< "< &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + // Initial matrix element + hermop.Op(noise,Mn); std::cout< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(0.002,60.0,1500,-0.5,3.5); \ + + RealD alpha=-0.8; + RealD beta =-0.8; +#define FILTER(llo,hhi,oorder) \ + { \ + Chebyshev Cheb(llo,hhi,oorder); \ + /* JacobiPolynomial Cheb(0.0,60.0,oorder,alpha,beta);*/\ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,combined); \ + } + + double node = 0.000; + FILTERb(lo,hi,orderfilter);// 0 + // FILTERc(node,hi,51);// 0 + noise = Mn; + int base = 0; + int mult = 100; + FILTER(node,hi,base+1*mult); + FILTER(node,hi,base+2*mult); + FILTER(node,hi,base+3*mult); + FILTER(node,hi,base+4*mult); + FILTER(node,hi,base+5*mult); + FILTER(node,hi,base+6*mult); + FILTER(node,hi,base+7*mult); + FILTER(node,hi,base+8*mult); + FILTER(node,hi,base+9*mult); + FILTER(node,hi,base+10*mult); + FILTER(node,hi,base+11*mult); + 
FILTER(node,hi,base+12*mult); + FILTER(node,hi,base+13*mult); + FILTER(node,hi,base+14*mult); + FILTER(node,hi,base+15*mult); + assert(b==nn); + } +#endif + +#if 0 + virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + // Initial matrix element + hermop.Op(noise,Mn); std::cout< "< JacobiPoly(0.005,60.,1500); + // JacobiPolynomial JacobiPoly(0.002,60.0,1500,-0.5,3.5); + //JacobiPolynomial JacobiPoly(0.03,60.0,500,-0.5,3.5); + // JacobiPolynomial JacobiPoly(0.00,60.0,1000,-0.5,3.5); + JacobiPoly(hermop,noise,Mn); + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< "< class CoarsenedMatrix : public SparseMatrixBase > > { public: - typedef iVector siteVector; + typedef iVector siteVector; + typedef Lattice CoarseComplexField; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; typedef iMatrix Cobj; @@ -304,7 +541,6 @@ public: CartesianStencil Stencil; std::vector A; - /////////////////////// // Interface @@ -316,7 +552,6 @@ public: conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - // RealD Nin = norm2(in); SimpleCompressor compressor; @@ -333,16 +568,14 @@ public: Aview *Aview_p = & AcceleratorViewContainer[0]; const int Nsimd = CComplex::Nsimd(); - typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0](0))) calcComplex; GridStopWatch ArithmeticTimer; int osites=Grid()->oSites(); - double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; - double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); + // double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; + // double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); double usecs =-usecond(); - // assert(geom.npoint==9); accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { @@ -418,7 +651,37 @@ public: auto out_v = out.View(); auto in_v = in.View(); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + int lane=SIMTlane(Nsimd); + SE=Stencil.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + } else { + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + } + synchronise(); + + for(int bb=0;bboSites(),1,{ + siteVector res = Zero(); siteVector nbr; int ptype; @@ -433,18 +696,23 @@ public: } else { nbr = Stencil.CommBuf()[SE->_offset]; } + synchronise(); res = res + Aview_p[point][ss]*nbr; out_v[ss]=res; }); - +#endif } void MdirAll(const CoarseVector &in,std::vector &out) { this->MdirComms(in); int ndir=geom.npoint-1; - assert(out.size()==ndir); + if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { + std::cout <<"MdirAll out size "<< out.size()< > &linop, - Aggregation & Subspace){ + Aggregation & Subspace) + { + typedef Lattice FineComplexField; + typedef typename Fobj::scalar_type scalar_type; - FineField iblock(FineGrid); // contributions from within this block 
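      // The iblock/oblock fields being removed here were rebuilt with a where()
      // conditional for every basis vector and stencil point; the replacement
      // below precomputes one boundary mask per stencil point (names as in the
      // new code):
      //
      //   masks[p] = where(mod(coor,block)==(block-1), one, zero);   // disp == +1
      //   masks[p] = where(mod(coor,block)==(Integer)0, one, zero);  // disp == -1
      //   imask    = one - omask;                                    // interior
      //
      // so each Dhop application is split into intra-block and inter-block
      // contributions via blockMaskedInnerProduct instead of repeated where() calls.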
- FineField oblock(FineGrid); // contributions from outwith this block + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); + FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); + + std::vector masks(geom.npoint,FineGrid); + FineComplexField imask(FineGrid); // contributions from within this block + FineComplexField omask(FineGrid); // contributions from outwith this block + + FineComplexField evenmask(FineGrid); + FineComplexField oddmask(FineGrid); FineField phi(FineGrid); FineField tmp(FineGrid); FineField zz(FineGrid); zz=Zero(); FineField Mphi(FineGrid); + FineField Mphie(FineGrid); + FineField Mphio(FineGrid); std::vector Mphi_p(geom.npoint,FineGrid); - Lattice > coor(FineGrid); + Lattice > coor (FineGrid); + Lattice > bcoor(FineGrid); + Lattice > bcb (FineGrid); CoarseVector iProj(Grid()); CoarseVector oProj(Grid()); - CoarseScalar InnerProd(Grid()); + CoarseVector SelfProj(Grid()); + CoarseComplexField iZProj(Grid()); + CoarseComplexField oZProj(Grid()); + CoarseScalar InnerProd(Grid()); // Orthogonalise the subblocks over the basis blockOrthogonalise(InnerProd,Subspace.subspace); @@ -525,22 +810,46 @@ public: // Compute the matrix elements of linop between this orthonormal // set of vectors. int self_stencil=-1; - for(int p=0;p_rdimensions[dir])/(Grid()->_rdimensions[dir]); + + LatticeCoordinate(coor,dir); + + /////////////////////////////////////////////////////// + // Work out even and odd block checkerboarding for fast diagonal term + /////////////////////////////////////////////////////// + if ( disp==1 ) { + bcb = bcb + div(coor,block); + } + + if ( disp==0 ) { + masks[p]= Zero(); + } else if ( disp==1 ) { + masks[p] = where(mod(coor,block)==(block-1),one,zero); + } else if ( disp==-1 ) { + masks[p] = where(mod(coor,block)==(Integer)0,one,zero); + } } + evenmask = where(mod(bcb,2)==(Integer)0,one,zero); + oddmask = one-evenmask; + assert(self_stencil!=-1); for(int i=0;i_rdimensions[dir])/(Grid()->_rdimensions[dir]); + if ( (disp==-1) || (!hermitian ) ) { - LatticeCoordinate(coor,dir); + //////////////////////////////////////////////////////////////////////// + // Pick out contributions coming from this cell and neighbour cell + //////////////////////////////////////////////////////////////////////// + omask = masks[p]; + imask = one-omask; + + for(int j=0;joSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); - //////////////////////////////////////////////////////////////////////// - // Pick out contributions coming from this cell and neighbour cell - //////////////////////////////////////////////////////////////////////// - if ( disp==0 ) { - iblock = Mphi; - oblock = Zero(); - } else if ( disp==1 ) { - oblock = where(mod(coor,block)==(block-1),Mphi,zz); - iblock = where(mod(coor,block)!=(block-1),Mphi,zz); - } else if ( disp==-1 ) { - oblock = where(mod(coor,block)==(Integer)0,Mphi,zz); - iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz); - } else { - assert(0); + } + } + } + + /////////////////////////////////////////// + // Faster alternate self coupling.. 
use hermiticity to save 2x + /////////////////////////////////////////// + { + mult(tmp,phi,evenmask); linop.Op(tmp,Mphie); + mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); + + // tmp = Mphie*evenmask + Mphio*oddmask; + { + auto tmp_ = tmp.View(); + auto evenmask_ = evenmask.View(); + auto oddmask_ = oddmask.View(); + auto Mphie_ = Mphie.View(); + auto Mphio_ = Mphio.View(); + accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ + coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); + }); } - // Could do local inner products, - // and then block pick the IP's. - // Ideally write a routine to do two masked block sums at once - std::cout << GridLogMessage<< "CoarsenMatrix picked "<oSites(),1,{ + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ for(int j=0;j Date: Mon, 27 Jan 2020 13:43:49 -0500 Subject: [PATCH 38/43] Add Jacobi polynomials --- Grid/algorithms/Algorithms.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index f1ac1c81..97ab4dc1 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -35,6 +35,7 @@ Author: Peter Boyle #include #include +#include #include #include #include From 2b5de5bba5c178eb072d753866cba42436545082 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 13:44:30 -0500 Subject: [PATCH 39/43] MdagM operator without norm option --- Grid/algorithms/LinearOperator.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index 4ea8ca8b..26f22ad2 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -97,8 +97,7 @@ public: _Mat.MdagM(in,out,n1,n2); } void HermOp(const Field &in, Field &out){ - RealD n1,n2; - HermOpAndNorm(in,out,n1,n2); + _Mat.MdagM(in,out); } }; @@ -172,7 +171,6 @@ public: } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ _Mat.M(in,out); - ComplexD dot= innerProduct(in,out); n1=real(dot); n2=norm2(out); } From 852fc1b00174b255e41d00591363ce494de8b8f1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 13:45:10 -0500 Subject: [PATCH 40/43] True Hierachical multigrid for DWF --- tests/solver/Test_dwf_hdcr.cc | 715 +++++++++++----------------------- 1 file changed, 224 insertions(+), 491 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 5a131a57..873530ff 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -1,3 +1,5 @@ + + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -29,357 +31,174 @@ Author: paboyle /* END LEGAL */ #include #include -//#include using namespace std; using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 -class myclass: Serializable { -public: - - GRID_SERIALIZABLE_CLASS_MEMBERS(myclass, - int, domaindecompose, - int, domainsize, - int, order, - int, Ls, - double, mq, - double, lo, - double, hi, - int, steps); - - myclass(){}; - -}; + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ RealD InverseApproximation(RealD x){ return 1.0/x; } -template +template 
class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template class MultiGridPreconditioner : public LinearFunction< Lattice > { public: typedef Aggregation Aggregates; typedef CoarsenedMatrix CoarseOperator; - - typedef typename Aggregation::siteVector siteVector; - typedef typename Aggregation::CoarseScalar CoarseScalar; typedef typename Aggregation::CoarseVector CoarseVector; typedef typename Aggregation::CoarseMatrix CoarseMatrix; typedef typename Aggregation::FineField FineField; typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; Aggregates & _Aggregates; CoarseOperator & _CoarseOperator; Matrix & _FineMatrix; FineOperator & _FineOperator; - Matrix & _SmootherMatrix; - FineOperator & _SmootherOperator; Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; - double cheby_hi; - double cheby_lo; - int cheby_ord; + int level; void Level(int lv) {level = lv; }; - myclass _params; +#define GridLogLevel std::cout << GridLogMessage < fMdagMOp(_FineMatrix); - - p1=in; - for(int i=0;i<50;i++){ - RealD absp1=std::sqrt(norm2(p1)); - fMdagMOp.HermOp(p1,p2);// this is the G5 herm bit - // _FineOperator.Op(p1,p2);// this is the G5 herm bit - RealD absp2=std::sqrt(norm2(p2)); - if(i%10==9) - std::cout< CG(1.0e-3,100,false); - ConjugateGradient fCG(1.0e-3,10,false); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - MdagMLinearOperator fMdagMOp(_FineMatrix); - - FineField tmp(in.Grid()); - FineField res(in.Grid()); - FineField Min(in.Grid()); - - // Monitor completeness of low mode space - _Aggregates.ProjectToSubspace (Csrc,in); - _Aggregates.PromoteFromSubspace(Csrc,out); - std::cout< CG(1.0e-10,100000); - ConjugateGradient fCG(1.0e-3,1000); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - ShiftedMdagMLinearOperator fMdagMOp(_FineMatrix,0.1); - - FineField tmp(in.Grid()); - FineField res(in.Grid()); - FineField Qin(in.Grid()); - - // Monitor completeness of low mode space - // _Aggregates.ProjectToSubspace (Csrc,in); - // _Aggregates.PromoteFromSubspace(Csrc,out); - // std::cout< fMdagMOp(_FineMatrix); - ShiftedMdagMLinearOperator fMdagMOp(_SmootherMatrix,0.0); - - 
RealD Ni,r; - - Ni = norm2(in); - - for(int ilo=0;ilo<3;ilo++){ - for(int ord=5;ord<50;ord*=2){ - - std::cout << " lo "< Cheby (lo[ilo],70.0,ord,InverseApproximation); - Cheby(fMdagMOp,vec1,vec2); // solves MdagM = g5 M g5M - - _FineOperator.Op(vec2,vec1);// this is the G5 herm bit - vec1 = in - vec1; // tmp = in - A Min - r=norm2(vec1); - std::cout< CG(5.0e-2,100000); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - // MdagMLinearOperator fMdagMOp(_FineMatrix); - ShiftedMdagMLinearOperator fMdagMOp(_SmootherMatrix,0.0); - FineField vec1(in.Grid()); FineField vec2(in.Grid()); - Chebyshev Cheby (_params.lo,_params.hi,_params.order,InverseApproximation); - Chebyshev ChebyAccu(_params.lo,_params.hi,_params.order,InverseApproximation); + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); const int nbasis= 32; + const int nbasisc= 32; auto clatt = GridDefaultLatt(); for(int d=0;d seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); @@ -407,49 +233,20 @@ int main (int argc, char ** argv) GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); - - Gamma g5(Gamma::Algebra::Gamma5); - LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; - LatticeFermion result(FGrid); result=Zero(); - LatticeFermion ref(FGrid); ref=Zero(); - LatticeFermion tmp(FGrid); - LatticeFermion err(FGrid); + LatticeFermion result(FGrid); LatticeGaugeField Umu(UGrid); - LatticeGaugeField UmuDD(UGrid); - LatticeColourMatrix U(UGrid); - LatticeColourMatrix zz(UGrid); FieldMetaData header; std::string file("./ckpoint_lat.4000"); NerscIO::readConfiguration(Umu,header,file); - - if ( params.domaindecompose ) { - Lattice > coor(UGrid); - zz=Zero(); - for(int mu=0;mu(Umu,mu); - U = where(mod(coor,params.domainsize)==(Integer)0,zz,U); - PokeIndex(UmuDD,U,mu); - } - } else { - UmuDD = Umu; - } - // SU3::ColdConfiguration(RNG4,Umu); - // SU3::TepidConfiguration(RNG4,Umu); - // SU3::HotConfiguration(RNG4,Umu); - // Umu=Zero(); - - RealD mass=params.mq; - RealD M5=1.8; - std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -463,204 +260,140 @@ int main (int argc, char ** argv) Subspace Aggregates(Coarse5d,FGrid,0); assert ( (nbasis & 0x1)==0); - { int nb=nbasis/2; - std::cout< HermIndefOp(Ddwf); - Gamma5R5HermitianLinearOperator HermIndefOpDD(DdwfDD); - CoarsenedMatrix LDOp(*Coarse5d,1); // Hermitian matrix - LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); - exit(0); + typedef CoarsenedMatrix Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); - CoarseVector c_src (Coarse5d); - CoarseVector c_res (Coarse5d); - gaussian(CRNG,c_src); - result=Zero(); - c_res=Zero(); ////////////////////////////////////////////////// // Deflate the course space. Recursive multigrid? 
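  //
  // A rough sketch of the recursion assembled below (class and variable names
  // as defined in this file; indicative only, not a literal call graph):
  //
  //   l1PGCR (fine outer solver, tol 1e-8)
  //    `- ThreeLevelPrecon = FineSmoother (Chebyshev of HermIndefOp/Ddwf)
  //                          + coarse correction solved by l2PGCR
  //        `- l2PGCR (coarse solver, tol 0.1)
  //            `- Level2Precon = CoarseSmoother (Chebyshev of L1LinOp/LDOp)
  //                              + coarse-coarse correction via CG on L2Op,
  //                                deflated with the IRL (Lanczos) eigenvectors
  //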
////////////////////////////////////////////////// - - typedef CoarsenedMatrix Level1Op; - typedef CoarsenedMatrix,nbasis> Level2Op; - - auto cclatt = clatt; - for(int d=0;d,nbasis> CoarseSubspace; + typedef Aggregation,nbasisc> CoarseSubspace; CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); - double c_first = 0.2; - double c_div = 1.2; - std::vector c_lo(nbasis/2); - c_lo[0] = c_first; - for(int b=1;b c_ord(nbasis/2,200); - c_ord[0]=500; - -#define RECURSIVE_MULTIGRID -#ifdef RECURSIVE_MULTIGRID std::cout< PosdefLdop(LDOp); - // CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nbasis,14.0,c_lo,c_ord); - // CoarseAggregates.CreateSubspaceRandom(CRNG); + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,12.0,0.02,500,100,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } - // Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix - // HermitianLinearOperator L1LinOp(LDOp); - // L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); -#endif + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + typedef Level2Op::CoarseVector CoarseCoarseVector; + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); - // std::cout< simple; - // ConjugateGradient fCG(1.0e-8,100000); - // fCG(HermDefOp,src,result); + MdagMLinearOperator IRLHermOpL2(L2Op); + Chebyshev IRLChebyL2(0.001,4.2,71); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); - std::cout< HermOpEO(Ddwf); - ConjugateGradient pCG(1.0e-8,10000); - // pCG(HermOpEO,src_o,result_o); + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,12.0,3,L1LinOp,LDOp); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); + + // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); - std::cout< IRLHermOp(LDOp); - Chebyshev IRLCheby(0.005,16.0,51); - // IRLCheby.InitLowPass(0.01,18.0,51); - FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); - PlainHermOp IRLOp (IRLHermOp); - - int Nstop=24; - int Nk=24; - int Nm=48; - ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); - int Nconv; - std::vector eval(Nm); - std::vector evec(Nm,Coarse5d); - IRL.calc(eval,evec,c_src,Nconv); - - - std::cout< CG(3.0e-3,100000); - // CG(PosdefLdop,c_src,c_res); - - 
std::cout< DeflCoarseGuesser(evec,eval); - DeflCoarseGuesser(c_src,c_res); - // CG(PosdefLdop,c_src,c_res); - - std::cout< CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); - MultiGridPreconditioner > - Precon (Aggregates, LDOp, - HermIndefOp,Ddwf, - HermIndefOp,Ddwf, - CoarseZeroGuesser, - params); + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); - // Precon.PowerMethod(src); - - /* std::cout<,nbasis,Level1Op,ZeroGuesser > - CoarsePrecon (CoarseAggregates, - L2Op, - L1LinOp,LDOp, - L1LinOp,LDOp, - CoarseZeroGuesser, - cparams); - - CoarsePrecon.PowerMethod(c_src); - */ - - /* - std::cout< PGCR(1.0e-8,100000,Precon,8,8); - std::cout< > - DeflatedPrecon (Aggregates, LDOp, - HermIndefOp,Ddwf, - HermIndefOp,Ddwf, - DeflCoarseGuesser, - params); - - PrecGeneralisedConjugateResidual deflPGCR(1.0e-8,100000,DeflatedPrecon,16,16); - - std::cout< CPGCR(1.0e-3,10000,CoarsePrecon,8,8); - std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); std::cout< Date: Fri, 3 Apr 2020 19:52:15 +0100 Subject: [PATCH 41/43] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4b0a86f8..9f690ce0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) +# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) **Data parallel C++ mathematical object library.** From 3b0e07882f3f4f808c69cd5922a5aacc50315eee Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 10 Apr 2020 11:28:33 -0400 Subject: [PATCH 42/43] Adding another form of polynomial --- Grid/algorithms/approx/JacobiPolynomial.h | 129 ++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 Grid/algorithms/approx/JacobiPolynomial.h diff --git a/Grid/algorithms/approx/JacobiPolynomial.h b/Grid/algorithms/approx/JacobiPolynomial.h new file mode 100644 index 00000000..e68d1dd7 --- /dev/null +++ b/Grid/algorithms/approx/JacobiPolynomial.h @@ -0,0 +1,129 @@ +#ifndef GRID_JACOBIPOLYNOMIAL_H +#define GRID_JACOBIPOLYNOMIAL_H + +#include + +NAMESPACE_BEGIN(Grid); + +template +class JacobiPolynomial : public OperatorFunction { + private: + using OperatorFunction::operator(); + + int order; + RealD hi; + RealD lo; + RealD alpha; + RealD beta; + + public: + void csv(std::ostream &out){ + csv(out,lo,hi); + } + void csv(std::ostream &out,RealD llo,RealD hhi){ + RealD diff = hhi-llo; + RealD delta = diff*1.0e-5; + for (RealD x=llo-delta; x<=hhi; x+=delta) { + RealD f = approx(x); + out<< x<<" "< &Linop, const Field &in, Field &out) { + GridBase *grid=in.Grid(); + + int 
vol=grid->gSites();
+
+    Field T0(grid);
+    Field T1(grid);
+    Field T2(grid);
+    Field y(grid);
+
+
+    Field *Tnm = &T0;
+    Field *Tn  = &T1;
+    Field *Tnp = &T2;
+
+    //    RealD T0=1.0;
+    T0=in;
+
+    //    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+    //           = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
+    //    i.e. y = xscale*M*in + mscale*in maps the spectrum [lo,hi] onto [-1,1]
+    RealD xscale = 2.0/(hi-lo);
+    RealD mscale = -(hi+lo)/(hi-lo);
+    Linop.HermOp(T0,y);
+    y=y*xscale+in*mscale;
+
+    //    RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
+    RealD halfAmB  = (alpha-beta)*0.5;
+    RealD halfApBp2= (alpha+beta+2.0)*0.5;
+    T1 = halfAmB * in + halfApBp2*y;
+
+    for(int n=2;n<=order;n++){
+
+      Linop.HermOp(*Tn,y);
+      y=xscale*y+mscale*(*Tn);
+
+      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
+      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
+      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
+      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
+
+      // Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
+      // normalise each recurrence coefficient by cnp exactly once
+      cny=cny/cnp;
+      cn1=cn1/cnp;
+      cnm=cnm/cnp;
+
+      *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
+
+      // Cycle pointers to avoid copies
+      Field *swizzle = Tnm;
+      Tnm    =Tn;
+      Tn     =Tnp;
+      Tnp    =swizzle;
+    }
+    out=*Tn; // after the final pointer cycle *Tn holds the highest-order term
+
+  }
+};
+NAMESPACE_END(Grid);
+#endif

From 014dbfa4645687770741012d894734258ad7320c Mon Sep 17 00:00:00 2001
From: Peter Boyle 
Date: Fri, 10 Apr 2020 11:57:09 -0400
Subject: [PATCH 43/43] Compile fix with OpDirAll

---
 Hadrons/Modules/MDistil/LapEvec.hpp     | 1 +
 tests/lanczos/Test_synthetic_lanczos.cc | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Hadrons/Modules/MDistil/LapEvec.hpp b/Hadrons/Modules/MDistil/LapEvec.hpp
index 3c1122ca..4576ffe3 100644
--- a/Hadrons/Modules/MDistil/LapEvec.hpp
+++ b/Hadrons/Modules/MDistil/LapEvec.hpp
@@ -198,6 +198,7 @@ public:
     }
   }
 
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ HADRONS_ERROR(Definition, "OpDirAll() undefined"); };
   void OpDiag (const Field &in, Field &out) { HADRONS_ERROR(Definition, "OpDiag() undefined"); };
   void OpDir  (const Field &in, Field &out,int dir,int disp) { HADRONS_ERROR(Definition, "OpDir() undefined"); };
   void Op     (const Field &in, Field &out) { HADRONS_ERROR(Definition, "Op() undefined"); };
diff --git a/tests/lanczos/Test_synthetic_lanczos.cc b/tests/lanczos/Test_synthetic_lanczos.cc
index 7a3591dd..a1e0e672 100644
--- a/tests/lanczos/Test_synthetic_lanczos.cc
+++ b/tests/lanczos/Test_synthetic_lanczos.cc
@@ -73,6 +73,7 @@ public:
   }
 
   // Support for coarsening to a multigrid
+  void OpDirAll  (const Field &in, std::vector<Field> &out){};
   void OpDiag (const Field &in, Field &out) {};
   void OpDir  (const Field &in, Field &out,int dir,int disp){};
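
PATCH 42/43 adds the JacobiPolynomial approximation without a test or usage example. The sketch below shows how such a filter might be driven through the same OperatorFunction interface as the Chebyshev filters used in the subspace-setup patches earlier in this series. It is a minimal illustration, not part of the patches: the constructor argument order (lo, hi, order, alpha, beta) is inferred from the private members declared in JacobiPolynomial.h, and JacobiFilterExample, the spectral window, the polynomial order and the Jacobi weights are invented for the example.

#include <Grid/Grid.h>
#include <Grid/algorithms/approx/JacobiPolynomial.h>

using namespace Grid;

// Apply an order-N Jacobi polynomial in a Hermitian operator to a source field,
// mirroring the way the Chebyshev filters are applied during subspace setup.
template<class Field>
void JacobiFilterExample(LinearOperatorBase<Field> &HermOp,
                         const Field &src, Field &filtered)
{
  // Illustrative parameters only: a window [lo,hi] bracketing the operator
  // spectrum, the polynomial order, and the Jacobi weights (alpha=beta=0
  // would reduce to Legendre polynomials).
  RealD lo = 0.1, hi = 64.0;
  int   order = 128;
  RealD alpha = 2.0, beta = 2.0;

  // Assumed constructor argument order, matching the private members
  // lo, hi, order, alpha, beta of the class.
  JacobiPolynomial<Field> Jacobi(lo, hi, order, alpha, beta);

  // Same calling convention as Chebyshev<Field>: filtered = P_order(M) src
  Jacobi(HermOp, src, filtered);

  // The csv() helper dumps the scalar polynomial for inspection, e.g.
  //   std::ofstream of("jacobi.csv"); Jacobi.csv(of);
}

In a CreateSubspaceChebyshev-style setup, a filter of this form could be substituted for the Chebyshev filter applied to each Gaussian noise vector; whether the extra (alpha, beta) freedom improves the low-mode coverage of the subspace is an experiment, not a claim made by these patches.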