From f0dc0f36214c4dda1eace9d83956e5d7fef4f729 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 22 Aug 2020 13:57:33 +0200
Subject: [PATCH 01/14] fix compile issue on Qpace3

---
 Grid/lattice/Lattice_transfer.h | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index e698e40e..91de721f 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -127,6 +127,11 @@ accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
   convertType(out,in._internal);
 }
 
+template<typename T1, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
+accelerator_inline void convertType(T1 & out, const iScalar<T1> & in) {
+  convertType(out,in._internal);
+}
+
 template<typename T1,typename T2>
 accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
   convertType(out._internal,in);

From 1292d595634172e28158f99d31863d63267ca5ac Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Thu, 11 Jun 2020 13:16:00 +0200
Subject: [PATCH 02/14] Add a typedef + broaden interface of CMat

---
 Grid/algorithms/CoarsenedMatrix.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index 8d184aea..76950baf 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -268,6 +268,21 @@ public:
   typedef iMatrix<CComplex,nbasis >  Cobj;
   typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
   typedef Lattice<Fobj >        FineField;
+  typedef CoarseVector FermionField;
+
+  // enrich interface
+  void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); }
+  void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); }
+  void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); }
+  void MooeeDag(CoarseVector const& in, CoarseVector& out) { assert(0); }
+  void MooeeInv(CoarseVector const& in, CoarseVector& out) { assert(0); }
+  void MooeeInvDag(CoarseVector const& in, CoarseVector& out) { assert(0); }
+  void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
+  void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
+  void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
+  void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
+  void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
+  void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
 
   ////////////////////
   // Data members

From dd1ba266b269b093a8bcd9bf815438d9896d7148 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Fri, 17 Jul 2020 11:58:02 +0200
Subject: [PATCH 03/14] Fix mapping between dir + disp and point in CMat

---
 Grid/algorithms/CoarsenedMatrix.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index 76950baf..d18fba43 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -432,25 +432,25 @@ public:
 
     //////////////
     // 4D action like wilson
-    // 0+ => 0 
-    // 0- => 1
-    // 1+ => 2 
-    // 1- => 3
+    // 0+ => 0
+    // 0- => 4
+    // 1+ => 1
+    // 1- => 5
     // etc..
     //////////////
     // 5D action like DWF
-    // 1+ => 0 
-    // 1- => 1
-    // 2+ => 2 
-    // 2- => 3
+    // 1+ => 0
+    // 1- => 4
+    // 2+ => 1
+    // 2- => 5
     // etc..
     auto point = [dir, disp, ndim](){
       if(dir == 0 and disp == 0)
 	return 8;
       else if ( ndim==4 ) { 
-	return (4 * dir + 1 - disp) / 2;
+	return (1 - disp) / 2 * 4 + dir;
       } else { 
-	return (4 * (dir-1) + 1 - disp) / 2;
+	return (1 - disp) / 2 * 4 + dir - 1;
       }
     }();
 

From b2087f14c48e881ab041bbaef7f6c444c88a895d Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Mon, 24 Aug 2020 16:54:36 +0200
Subject: [PATCH 04/14] Fix CoarsenedMatrix regarding illegal memory accesses

Need a reference to geom since the lambda copies the this pointer which points to host memory, see
- https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture
- https://devblogs.nvidia.com/new-compiler-features-cuda-8/
---
 Grid/algorithms/CoarsenedMatrix.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index d18fba43..ba40535c 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -310,6 +310,8 @@ public:
     Stencil.HaloExchange(in,compressor);
     autoView( in_v , in, AcceleratorRead);
     autoView( out_v , out, AcceleratorWrite);
+    autoView( Stencil_v  , Stencil, AcceleratorRead);
+    auto& geom_v = geom;
     typedef LatticeView<Cobj> Aview;
       
     Vector<Aview> AcceleratorViewContainer;
@@ -331,14 +333,14 @@ public:
       int ptype;
       StencilEntry *SE;
 
-      for(int point=0;point<geom.npoint;point++){
+      for(int point=0;point<geom_v.npoint;point++){
 
-	SE=Stencil.GetEntry(ptype,point,ss);
+	SE=Stencil_v.GetEntry(ptype,point,ss);
 	  
 	if(SE->_is_local) { 
 	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 	} else {
-	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
+	  nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
 	}
 	acceleratorSynchronise();
 
@@ -382,6 +384,7 @@ public:
 
     autoView( out_v , out, AcceleratorWrite);
     autoView( in_v  , in, AcceleratorRead);
+    autoView( Stencil_v  , Stencil, AcceleratorRead);
 
     const int Nsimd = CComplex::Nsimd();
     typedef decltype(coalescedRead(in_v[0])) calcVector;
@@ -395,12 +398,12 @@ public:
       int ptype;
       StencilEntry *SE;
 
-      SE=Stencil.GetEntry(ptype,point,ss);
+      SE=Stencil_v.GetEntry(ptype,point,ss);
 	  
       if(SE->_is_local) { 
 	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
       } else {
-	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
+	nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
       }
       acceleratorSynchronise();
 

From 2a75516330925bd05681f4dba639482ca84270d7 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 26 Aug 2020 12:34:17 -0400
Subject: [PATCH 05/14] state MPI/SLURM message only on world_rank zero

---
 Grid/threads/Accelerator.cc | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 864d90a9..f6df4a31 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -21,22 +21,26 @@ void acceleratorInit(void)
 #define ENV_RANK_SLURM         "SLURM_PROCID"
 #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
-  // We extract the local rank initialization using an environment variable
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) {
-    printf("OPENMPI detected\n");
-    rank = atoi(localRankStr);		
-  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) {
-    printf("MVAPICH detected\n");
-    rank = atoi(localRankStr);		
-  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) {
-    printf("SLURM detected\n");
-    rank = atoi(localRankStr);		
-  } else { 
-    printf("MPI version is unknown - bad things may happen\n");
-  }
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) {
+    if (!world_rank)
+      printf("OPENMPI detected\n");
+    rank = atoi(localRankStr);		
+  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) {
+    if (!world_rank)
+      printf("MVAPICH detected\n");
+    rank = atoi(localRankStr);		
+  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) {
+    if (!world_rank)
+      printf("SLURM detected\n");
+    rank = atoi(localRankStr);		
+  } else { 
+    if (!world_rank)
+      printf("MPI version is unknown - bad things may happen\n");
+  }
 
   size_t totalDeviceMem=0;
   for (int i = 0; i < nDevices; i++) {

From cf3535d16e412adf80ca1a2f7ead54003860c94b Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Thu, 27 Aug 2020 14:06:48 +0200
Subject: [PATCH 06/14] Expose more functions in CMat

---
 Grid/algorithms/CoarsenedMatrix.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index ba40535c..cdfc2ac3 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -271,6 +271,9 @@ public:
   typedef CoarseVector FermionField;
 
   // enrich interface
+  void Dhop(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); }
+  void DhopEO(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); }
+  void DhopOE(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); }
   void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); }
   void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); }
   void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); }

From 4d2dc7ba0304397f536a2bc71260266e1475ae83 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Mon, 7 Sep 2020 17:57:07 +0200
Subject: [PATCH 07/14]  Enable even-odd for CoarsenedMatrix

---
 Grid/algorithms/CoarsenedMatrix.h    | 471 +++++++++++++++++++++++---
 tests/solver/Test_coarse_even_odd.cc | 475 +++++++++++++++++++++++++++
 2 files changed, 899 insertions(+), 47 deletions(-)
 create mode 100644 tests/solver/Test_coarse_even_odd.cc

diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index cdfc2ac3..66b9c169 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -31,6 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H
 
+#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
 
 NAMESPACE_BEGIN(Grid);
 
@@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
 class Geometry {
 public:
   int npoint;
+  int base;
   std::vector<int> directions   ;
   std::vector<int> displacements;
+  std::vector<int> points_dagger;
 
   Geometry(int _d)  {
     
-    int base = (_d==5) ? 1:0;
+    base = (_d==5) ? 1:0;
 
     // make coarse grid stencil for 4d , not 5d
     if ( _d==5 ) _d=4;
@@ -72,16 +75,51 @@ public:
     npoint = 2*_d+1;
     directions.resize(npoint);
     displacements.resize(npoint);
+    points_dagger.resize(npoint);
     for(int d=0;d<_d;d++){
       directions[d   ] = d+base;
       directions[d+_d] = d+base;
       displacements[d  ] = +1;
       displacements[d+_d]= -1;
+      points_dagger[d   ] = d+_d;
+      points_dagger[d+_d] = d;
     }
     directions   [2*_d]=0;
     displacements[2*_d]=0;
+    points_dagger[2*_d]=2*_d;
   }
 
+  int point(int dir, int disp) {
+    assert(disp == -1 || disp == 0 || disp == 1);
+    assert(base+0 <= dir && dir < base+4);
+
+    // directions faster index = new indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  1  2  3  0  1  2  3  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  2  3  4  1  2  3  4  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+
+    // displacements faster index = old indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  0  1  1  2  2  3  3  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  1  2  2  3  3  4  4  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+
+    if(dir == 0 and disp == 0)
+      return 8;
+    else // New indexing
+      return (1 - disp) / 2 * 4 + dir - base;
+    // else // Old indexing
+    //   return (4 * (dir - base) + 1 - disp) / 2;
+  }
 };
   
 template<class Fobj,class CComplex,int nbasis>
@@ -258,7 +296,7 @@ public:
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
-class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
 public:
     
   typedef iVector<CComplex,nbasis >           siteVector;
@@ -270,16 +308,7 @@ public:
   typedef Lattice<Fobj >        FineField;
   typedef CoarseVector FermionField;
 
-  // enrich interface
-  void Dhop(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); }
-  void DhopEO(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); }
-  void DhopOE(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); }
-  void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); }
-  void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); }
-  void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); }
-  void MooeeDag(CoarseVector const& in, CoarseVector& out) { assert(0); }
-  void MooeeInv(CoarseVector const& in, CoarseVector& out) { assert(0); }
-  void MooeeInvDag(CoarseVector const& in, CoarseVector& out) { assert(0); }
+  // enrich interface, use default implementation as in FermionOperator ///////
   void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
   void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
   void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
@@ -292,21 +321,36 @@ public:
   ////////////////////
   Geometry         geom;
   GridBase *       _grid; 
+  GridBase*        _cbgrid;
   int hermitian;
 
   CartesianStencil<siteVector,siteVector,int> Stencil; 
+  CartesianStencil<siteVector,siteVector,int> StencilEven;
+  CartesianStencil<siteVector,siteVector,int> StencilOdd;
 
   std::vector<CoarseMatrix> A;
-    
+  std::vector<CoarseMatrix> Aeven;
+  std::vector<CoarseMatrix> Aodd;
+
+  CoarseMatrix AselfInv;
+  CoarseMatrix AselfInvEven;
+  CoarseMatrix AselfInvOdd;
+
+  Vector<RealD> dag_factor;
+
   ///////////////////////
   // Interface
   ///////////////////////
   GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
+  GridBase * RedBlackGrid()     { return _cbgrid; };
+
+  int ConstEE() { return 0; }
 
   void M (const CoarseVector &in, CoarseVector &out)
   {
     conformable(_grid,in.Grid());
     conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();
 
     SimpleCompressor<siteVector> compressor;
 
@@ -364,12 +408,72 @@ public:
       return M(in,out);
     } else {
       // corresponds to Galerkin coarsening
-      CoarseVector tmp(Grid());
-      G5C(tmp, in); 
-      M(tmp, out);
-      G5C(out, out);
+      return MdagNonHermitian(in, out);
     }
   };
+
+  void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
+  {
+    conformable(_grid,in.Grid());
+    conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();
+
+    SimpleCompressor<siteVector> compressor;
+
+    Stencil.HaloExchange(in,compressor);
+    autoView( in_v , in, AcceleratorRead);
+    autoView( out_v , out, AcceleratorWrite);
+    autoView( Stencil_v  , Stencil, AcceleratorRead);
+    auto& geom_v = geom;
+    typedef LatticeView<Cobj> Aview;
+
+    Vector<Aview> AcceleratorViewContainer;
+
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    Aview *Aview_p = & AcceleratorViewContainer[0];
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    int osites=Grid()->oSites();
+
+    Vector<int> points(geom.npoint, 0);
+    for(int p=0; p<geom.npoint; p++)
+      points[p] = geom.points_dagger[p];
+
+    RealD* dag_factor_p = &dag_factor[0];
+
+    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
+      int ss = sss/nbasis;
+      int b  = sss%nbasis;
+      calcComplex res = Zero();
+      calcVector nbr;
+      int ptype;
+      StencilEntry *SE;
+
+      for(int p=0;p<geom_v.npoint;p++){
+        int point = points[p];
+
+	SE=Stencil_v.GetEntry(ptype,point,ss);
+
+	if(SE->_is_local) {
+	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+	} else {
+	  nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
+	}
+	acceleratorSynchronise();
+
+	for(int bb=0;bb<nbasis;bb++) {
+	  res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+	}
+      }
+      coalescedWrite(out_v[ss](b),res);
+      });
+
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+  }
+
   void MdirComms(const CoarseVector &in)
   {
     SimpleCompressor<siteVector> compressor;
@@ -379,6 +483,7 @@ public:
   {
     conformable(_grid,in.Grid());
     conformable(_grid,out.Grid());
+    out.Checkerboard() = in.Checkerboard();
 
     typedef LatticeView<Cobj> Aview;
     Vector<Aview> AcceleratorViewContainer;
@@ -434,34 +539,7 @@ public:
 
     this->MdirComms(in);
 
-    int ndim = in.Grid()->Nd();
-
-    //////////////
-    // 4D action like wilson
-    // 0+ => 0
-    // 0- => 4
-    // 1+ => 1
-    // 1- => 5
-    // etc..
-    //////////////
-    // 5D action like DWF
-    // 1+ => 0
-    // 1- => 4
-    // 2+ => 1
-    // 2- => 5
-    // etc..
-    auto point = [dir, disp, ndim](){
-      if(dir == 0 and disp == 0)
-	return 8;
-      else if ( ndim==4 ) { 
-	return (1 - disp) / 2 * 4 + dir;
-      } else { 
-	return (1 - disp) / 2 * 4 + dir - 1;
-      }
-    }();
-
-    MdirCalc(in,out,point);
-
+    MdirCalc(in,out,geom.point(dir,disp));
   };
 
   void Mdiag(const CoarseVector &in, CoarseVector &out)
@@ -470,17 +548,269 @@ public:
     MdirCalc(in, out, point); // No comms
   };
 
+  void Mooee(const CoarseVector &in, CoarseVector &out) {
+    MooeeInternal(in, out, DaggerNo, InverseNo);
+  }
+
+  void MooeeInv(const CoarseVector &in, CoarseVector &out) {
+    MooeeInternal(in, out, DaggerNo, InverseYes);
+  }
+
+  void MooeeDag(const CoarseVector &in, CoarseVector &out) {
+    MooeeInternal(in, out, DaggerYes, InverseNo);
+  }
+
+  void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
+    MooeeInternal(in, out, DaggerYes, InverseYes);
+  }
+
+  void Meooe(const CoarseVector &in, CoarseVector &out) {
+    if(in.Checkerboard() == Odd) {
+      DhopEO(in, out, DaggerNo);
+    } else {
+      DhopOE(in, out, DaggerNo);
+    }
+  }
+
+  void MeooeDag(const CoarseVector &in, CoarseVector &out) {
+    if(in.Checkerboard() == Odd) {
+      DhopEO(in, out, DaggerYes);
+    } else {
+      DhopOE(in, out, DaggerYes);
+    }
+  }
+
+  void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
+    conformable(in.Grid(), _grid); // verifies full grid
+    conformable(in.Grid(), out.Grid());
+
+    out.Checkerboard() = in.Checkerboard();
+
+    DhopInternal(Stencil, A, in, out, dag);
+  }
+
+  void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
+    conformable(in.Grid(), _cbgrid);    // verifies half grid
+    conformable(in.Grid(), out.Grid()); // drops the cb check
+
+    assert(in.Checkerboard() == Even);
+    out.Checkerboard() = Odd;
+
+    DhopInternal(StencilEven, Aodd, in, out, dag);
+  }
+
+  void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
+    conformable(in.Grid(), _cbgrid);    // verifies half grid
+    conformable(in.Grid(), out.Grid()); // drops the cb check
+
+    assert(in.Checkerboard() == Odd);
+    out.Checkerboard() = Even;
+
+    DhopInternal(StencilOdd, Aeven, in, out, dag);
+  }
+
+  void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
+    out.Checkerboard() = in.Checkerboard();
+    assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+
+    CoarseMatrix *Aself = nullptr;
+    if(in.Grid()->_isCheckerBoarded) {
+      if(in.Checkerboard() == Odd) {
+        Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
+        DselfInternal(StencilOdd, *Aself, in, out, dag);
+      } else {
+        Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
+        DselfInternal(StencilEven, *Aself, in, out, dag);
+      }
+    } else {
+      Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
+      DselfInternal(Stencil, *Aself, in, out, dag);
+    }
+    assert(Aself != nullptr);
+  }
+
+  void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
+                       const CoarseVector &in, CoarseVector &out, int dag) {
+    int point = geom.npoint-1;
+    autoView( out_v, out, AcceleratorWrite);
+    autoView( in_v,  in,  AcceleratorRead);
+    autoView( st_v,  st,  AcceleratorRead);
+    autoView( a_v,   a,   AcceleratorRead);
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    RealD* dag_factor_p = &dag_factor[0];
+
+    if(dag) {
+      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
+        int ss = sss/nbasis;
+        int b  = sss%nbasis;
+        calcComplex res = Zero();
+        calcVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        SE=st_v.GetEntry(ptype,point,ss);
+
+        if(SE->_is_local) {
+          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+        } else {
+          nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
+        }
+        acceleratorSynchronise();
+
+        for(int bb=0;bb<nbasis;bb++) {
+          res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
+        }
+        coalescedWrite(out_v[ss](b),res);
+      });
+    } else {
+      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
+        int ss = sss/nbasis;
+        int b  = sss%nbasis;
+        calcComplex res = Zero();
+        calcVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        SE=st_v.GetEntry(ptype,point,ss);
+
+        if(SE->_is_local) {
+          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+        } else {
+          nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
+        }
+        acceleratorSynchronise();
+
+        for(int bb=0;bb<nbasis;bb++) {
+          res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
+        }
+        coalescedWrite(out_v[ss](b),res);
+      });
+    }
+  }
+
+  void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
+                    const CoarseVector &in, CoarseVector &out, int dag) {
+    SimpleCompressor<siteVector> compressor;
+
+    st.HaloExchange(in,compressor);
+    autoView( in_v,  in,  AcceleratorRead);
+    autoView( out_v, out, AcceleratorWrite);
+    autoView( st_v , st,  AcceleratorRead);
+    typedef LatticeView<Cobj> Aview;
+
+    // determine in what order we need the points
+    int npoint = geom.npoint-1;
+    Vector<int> points(npoint, 0);
+    for(int p=0; p<npoint; p++)
+      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
+
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
+    Aview *Aview_p = & AcceleratorViewContainer[0];
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    RealD* dag_factor_p = &dag_factor[0];
+
+    if(dag) {
+      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
+        int ss = sss/nbasis;
+        int b  = sss%nbasis;
+        calcComplex res = Zero();
+        calcVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        for(int p=0;p<npoint;p++){
+          int point = points[p];
+          SE=st_v.GetEntry(ptype,point,ss);
+
+          if(SE->_is_local) {
+            nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+          } else {
+            nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
+          }
+          acceleratorSynchronise();
+
+          for(int bb=0;bb<nbasis;bb++) {
+            res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+          }
+        }
+        coalescedWrite(out_v[ss](b),res);
+      });
+    } else {
+      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
+        int ss = sss/nbasis;
+        int b  = sss%nbasis;
+        calcComplex res = Zero();
+        calcVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        for(int p=0;p<npoint;p++){
+          int point = points[p];
+          SE=st_v.GetEntry(ptype,point,ss);
+
+          if(SE->_is_local) {
+            nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+          } else {
+            nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
+          }
+          acceleratorSynchronise();
+
+          for(int bb=0;bb<nbasis;bb++) {
+            res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+          }
+        }
+        coalescedWrite(out_v[ss](b),res);
+      });
+    }
+
+    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
+  }
   
- CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	: 
+ CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) 	:
 
     _grid(&CoarseGrid),
+    _cbgrid(&CoarseRBGrid),
     geom(CoarseGrid._ndimension),
     hermitian(hermitian_),
     Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-      A(geom.npoint,&CoarseGrid)
+    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+    A(geom.npoint,&CoarseGrid),
+    Aeven(geom.npoint,&CoarseRBGrid),
+    Aodd(geom.npoint,&CoarseRBGrid),
+    AselfInv(&CoarseGrid),
+    AselfInvEven(&CoarseRBGrid),
+    AselfInvOdd(&CoarseRBGrid),
+    dag_factor(nbasis*nbasis)
   {
+    fillFactor();
   };
 
+  void fillFactor() {
+    Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis);
+    if(!hermitian) {
+      const int nb = nbasis/2;
+      dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
+      dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
+    }
+
+    // GPU readable prefactor
+    thread_for(i, nbasis*nbasis, {
+      int j = i/nbasis;
+      int k = i%nbasis;
+      dag_factor[i] = dag_factor_eigen(j, k);
+    });
+  }
+
   void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
   {
@@ -629,6 +959,9 @@ public:
       std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
       ForceHermitian();
     }
+
+    InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
+    FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
   }
 
   void ForceHermitian(void) {
@@ -650,6 +983,50 @@ public:
       }
     }
   }
+
+  void InvertSelfStencilLink() {
+    std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
+    int localVolume = Grid()->lSites();
+
+    typedef typename Cobj::scalar_object scalar_object;
+
+    autoView(Aself_v,    A[geom.npoint-1], CpuRead);
+    autoView(AselfInv_v, AselfInv,         CpuWrite);
+    thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
+      Eigen::MatrixXcd selfLinkEigen    = Eigen::MatrixXcd::Zero(nbasis, nbasis);
+      Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
+
+      scalar_object selfLink    = Zero();
+      scalar_object selfLinkInv = Zero();
+
+      Coordinate lcoor;
+
+      Grid()->LocalIndexToLocalCoor(site, lcoor);
+      peekLocalSite(selfLink, Aself_v, lcoor);
+
+      for (int i = 0; i < nbasis; ++i)
+        for (int j = 0; j < nbasis; ++j)
+          selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
+
+      selfLinkInvEigen = selfLinkEigen.inverse();
+
+      for(int i = 0; i < nbasis; ++i)
+        for(int j = 0; j < nbasis; ++j)
+          selfLinkInv(i, j) = selfLinkInvEigen(i, j);
+
+      pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
+    });
+  }
+
+  void FillHalfCbs() {
+    std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
+    for(int p = 0; p < geom.npoint; ++p) {
+      pickCheckerboard(Even, Aeven[p], A[p]);
+      pickCheckerboard(Odd, Aodd[p], A[p]);
+    }
+    pickCheckerboard(Even, AselfInvEven, AselfInv);
+    pickCheckerboard(Odd, AselfInvOdd, AselfInv);
+  }
 };
 
 NAMESPACE_END(Grid);
diff --git a/tests/solver/Test_coarse_even_odd.cc b/tests/solver/Test_coarse_even_odd.cc
new file mode 100644
index 00000000..dfbab747
--- /dev/null
+++ b/tests/solver/Test_coarse_even_odd.cc
@@ -0,0 +1,475 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./tests/solver/Test_coarse_even_odd.cc
+
+    Copyright (C) 2015-2020
+
+    Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+#ifndef NBASIS
+#define NBASIS 40
+#endif
+
+// NOTE: The tests in this file are written in analogy to
+// - tests/core/Test_wilson_even_odd.cc
+// - tests/core/Test_wilson_clover.cc
+
+std::vector<int> readFromCommandlineIvec(int*                    argc,
+                                         char***                 argv,
+                                         std::string&&           option,
+                                         const std::vector<int>& defaultValue) {
+  std::string      arg;
+  std::vector<int> ret(defaultValue);
+  if(GridCmdOptionExists(*argv, *argv + *argc, option)) {
+    arg = GridCmdOptionPayload(*argv, *argv + *argc, option);
+    GridCmdOptionIntVector(arg, ret);
+  }
+  return ret;
+}
+
+int main(int argc, char** argv) {
+  Grid_init(&argc, &argv);
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                          Read from command line                         //
+  /////////////////////////////////////////////////////////////////////////////
+
+  const int  nbasis    = NBASIS; static_assert((nbasis & 0x1) == 0, "");
+  const int  nb        = nbasis/2;
+  Coordinate blockSize = readFromCommandlineIvec(&argc, &argv, "--blocksize", {2, 2, 2, 2});
+
+  std::cout << GridLogMessage << "Compiled with nbasis = " << nbasis << " -> nb = " << nb << std::endl;
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                              General setup                              //
+  /////////////////////////////////////////////////////////////////////////////
+
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0; d<clatt.size(); d++) clatt[d] = clatt[d] / blockSize[d];
+
+  GridCartesian*         Grid_f   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
+  GridCartesian*         Grid_c   = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* RBGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(Grid_f);
+  GridRedBlackCartesian* RBGrid_c = SpaceTimeGrid::makeFourDimRedBlackGrid(Grid_c);
+
+  std::cout << GridLogMessage << "Grid_f:" << std::endl; Grid_f->show_decomposition();
+  std::cout << GridLogMessage << "Grid_c:" << std::endl; Grid_c->show_decomposition();
+  std::cout << GridLogMessage << "RBGrid_f:" << std::endl; RBGrid_f->show_decomposition();
+  std::cout << GridLogMessage << "RBGrid_c:" << std::endl; RBGrid_c->show_decomposition();
+
+  GridParallelRNG pRNG_f(Grid_f);
+  GridParallelRNG pRNG_c(Grid_c);
+
+  std::vector<int> seeds({1, 2, 3, 4});
+
+  pRNG_f.SeedFixedIntegers(seeds);
+  pRNG_c.SeedFixedIntegers(seeds);
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                    Setup of Dirac Matrix and Operator                   //
+  /////////////////////////////////////////////////////////////////////////////
+
+  LatticeGaugeField Umu(Grid_f); SU3::HotConfiguration(pRNG_f, Umu);
+
+  RealD checkTolerance = (getPrecision<LatticeFermion>::value == 1) ? 1e-7 : 1e-15;
+
+  RealD mass = -0.30;
+  RealD csw  = 1.9192;
+
+  WilsonCloverFermionR Dwc(Umu, *Grid_f, *RBGrid_f, mass, csw, csw);
+  MdagMLinearOperator<WilsonCloverFermionR, LatticeFermion> MdagMOp_Dwc(Dwc);
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                             Type definitions                            //
+  /////////////////////////////////////////////////////////////////////////////
+
+  typedef Aggregation<vSpinColourVector, vTComplex, nbasis>     Aggregates;
+  typedef CoarsenedMatrix<vSpinColourVector, vTComplex, nbasis> CoarseDiracMatrix;
+  typedef CoarseDiracMatrix::CoarseVector                       CoarseVector;
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                           Setup of Aggregation                          //
+  /////////////////////////////////////////////////////////////////////////////
+
+  Aggregates Aggs(Grid_c, Grid_f, 0);
+  {
+    LatticeFermion tmp(Aggs.subspace[0].Grid());
+    for(int n = 0; n < nb; n++) {
+      gaussian(pRNG_f, Aggs.subspace[n]);
+      G5C(tmp, Aggs.subspace[n]);
+      axpby(Aggs.subspace[n + nb], 0.5, -0.5, Aggs.subspace[n], tmp);
+      axpby(Aggs.subspace[n], 0.5, 0.5, Aggs.subspace[n], tmp);
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                  Setup of CoarsenedMatrix and Operator                  //
+  /////////////////////////////////////////////////////////////////////////////
+
+  const int hermitian = 0;
+  CoarseDiracMatrix Dc(*Grid_c, *RBGrid_c, hermitian);
+  Dc.CoarsenOperator(Grid_f, MdagMOp_Dwc, Aggs);
+  MdagMLinearOperator<CoarseDiracMatrix, CoarseVector> MdagMOp_Dc(Dc);
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                     Setup vectors used in all tests                     //
+  /////////////////////////////////////////////////////////////////////////////
+
+  CoarseVector src(Grid_c);  random(pRNG_c, src);
+  CoarseVector diff(Grid_c); diff = Zero();
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                              Start of tests                             //
+  /////////////////////////////////////////////////////////////////////////////
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test Dhop + Mdiag = Munprec" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector phi(Grid_c); phi = Zero();
+    CoarseVector chi(Grid_c); chi = Zero();
+    CoarseVector res(Grid_c); res = Zero();
+    CoarseVector ref(Grid_c); ref = Zero();
+
+    Dc.Mdiag(src, phi);          std::cout << GridLogMessage << "Applied Mdiag" << std::endl;
+    Dc.Dhop(src, chi, DaggerNo); std::cout << GridLogMessage << "Applied Dhop"  << std::endl;
+    Dc.M(src, ref);              std::cout << GridLogMessage << "Applied M"     << std::endl;
+
+    res = phi + chi;
+
+    diff = ref - res;
+    auto absDev = norm2(diff);
+    auto relDev = absDev / norm2(ref);
+    std::cout << GridLogMessage << "norm2(Munprec), norm2(Dhop + Mdiag), abs. deviation, rel. deviation: "
+              << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev << " -> check "
+              << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+    assert(relDev <= checkTolerance);
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test Meo + Moe = Dhop" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector src_e(RBGrid_c); src_e = Zero();
+    CoarseVector src_o(RBGrid_c); src_o = Zero();
+    CoarseVector res_e(RBGrid_c); res_e = Zero();
+    CoarseVector res_o(RBGrid_c); res_o = Zero();
+    CoarseVector res(Grid_c);     res = Zero();
+    CoarseVector ref(Grid_c);     ref = Zero();
+
+    pickCheckerboard(Even, src_e, src);
+    pickCheckerboard(Odd,  src_o, src);
+
+    Dc.Meooe(src_e, res_o);        std::cout << GridLogMessage << "Applied Meo"  << std::endl;
+    Dc.Meooe(src_o, res_e);        std::cout << GridLogMessage << "Applied Moe"  << std::endl;
+    Dc.Dhop(src, ref, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl;
+
+    setCheckerboard(res, res_o);
+    setCheckerboard(res, res_e);
+
+    diff = ref - res;
+    auto absDev = norm2(diff);
+    auto relDev = absDev / norm2(ref);
+    std::cout << GridLogMessage << "norm2(Dhop), norm2(Meo + Moe), abs. deviation, rel. deviation: "
+              << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev
+              << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+    assert(relDev <= checkTolerance);
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test |(Im(v^dag M^dag M v)| = 0" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector tmp(Grid_c); tmp = Zero();
+    CoarseVector phi(Grid_c); phi = Zero();
+
+    Dc.M(src, tmp);    std::cout << GridLogMessage << "Applied M"    << std::endl;
+    Dc.Mdag(tmp, phi); std::cout << GridLogMessage << "Applied Mdag" << std::endl;
+
+    std::cout << GridLogMessage << "src = " << norm2(src) << " tmp = " << norm2(tmp) << " phi = " << norm2(phi) << std::endl;
+
+    ComplexD dot = innerProduct(src, phi);
+
+    auto relDev = abs(imag(dot)) / abs(real(dot));
+    std::cout << GridLogMessage << "Re(v^dag M^dag M v), Im(v^dag M^dag M v), rel.deviation: "
+              << real(dot) << " " << imag(dot) << " " << relDev
+              << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+    assert(relDev <= checkTolerance);
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test |(Im(v^dag Mooee^dag Mooee v)| = 0 (full grid)" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector tmp(Grid_c); tmp = Zero();
+    CoarseVector phi(Grid_c); phi = Zero();
+
+    Dc.Mooee(src, tmp);    std::cout << GridLogMessage << "Applied Mooee"    << std::endl;
+    Dc.MooeeDag(tmp, phi); std::cout << GridLogMessage << "Applied MooeeDag" << std::endl;
+
+    ComplexD dot = innerProduct(src, phi);
+
+    auto relDev = abs(imag(dot)) / abs(real(dot));
+    std::cout << GridLogMessage << "Re(v^dag Mooee^dag Mooee v), Im(v^dag Mooee^dag Mooee v), rel.deviation: "
+              << real(dot) << " " << imag(dot) << " " << relDev
+              << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+    assert(relDev <= checkTolerance);
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (full grid)" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector tmp(Grid_c); tmp = Zero();
+    CoarseVector phi(Grid_c); phi = Zero();
+
+    Dc.Mooee(src, tmp);    std::cout << GridLogMessage << "Applied Mooee"    << std::endl;
+    Dc.MooeeInv(tmp, phi); std::cout << GridLogMessage << "Applied MooeeInv" << std::endl;
+
+    diff        = src - phi;
+    auto absDev = norm2(diff);
+    auto relDev = absDev / norm2(src);
+    std::cout << GridLogMessage << "norm2(src), norm2(MooeeInv Mooee src), abs. deviation, rel. deviation: "
+              << norm2(src) << " " << norm2(phi) << " " << absDev << " " << relDev
+              << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+    assert(relDev <= checkTolerance);
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test MeooeDagger is the dagger of Meooe by requiring" << std::endl;
+    std::cout << GridLogMessage << "=  < phi | Meooe | chi > * = < chi | Meooe^dag| phi>" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    // clang-format off
+    CoarseVector phi(Grid_c); random(pRNG_c, phi);
+    CoarseVector chi(Grid_c); random(pRNG_c, chi);
+    CoarseVector chi_e(RBGrid_c);   chi_e = Zero();
+    CoarseVector chi_o(RBGrid_c);   chi_o = Zero();
+    CoarseVector dchi_e(RBGrid_c); dchi_e = Zero();
+    CoarseVector dchi_o(RBGrid_c); dchi_o = Zero();
+    CoarseVector phi_e(RBGrid_c);   phi_e = Zero();
+    CoarseVector phi_o(RBGrid_c);   phi_o = Zero();
+    CoarseVector dphi_e(RBGrid_c); dphi_e = Zero();
+    CoarseVector dphi_o(RBGrid_c); dphi_o = Zero();
+    // clang-format on
+
+    pickCheckerboard(Even, chi_e, chi);
+    pickCheckerboard(Odd,  chi_o, chi);
+    pickCheckerboard(Even, phi_e, phi);
+    pickCheckerboard(Odd,  phi_o, phi);
+
+    Dc.Meooe(chi_e, dchi_o);    std::cout << GridLogMessage << "Applied Meo"    << std::endl;
+    Dc.Meooe(chi_o, dchi_e);    std::cout << GridLogMessage << "Applied Moe"    << std::endl;
+    Dc.MeooeDag(phi_e, dphi_o); std::cout << GridLogMessage << "Applied MeoDag" << std::endl;
+    Dc.MeooeDag(phi_o, dphi_e); std::cout << GridLogMessage << "Applied MoeDag" << std::endl;
+
+    ComplexD phiDchi_e = innerProduct(phi_e, dchi_e);
+    ComplexD phiDchi_o = innerProduct(phi_o, dchi_o);
+    ComplexD chiDphi_e = innerProduct(chi_e, dphi_e);
+    ComplexD chiDphi_o = innerProduct(chi_o, dphi_o);
+
+    std::cout << GridLogDebug << "norm dchi_e = " << norm2(dchi_e) << " norm dchi_o = " << norm2(dchi_o) << " norm dphi_e = " << norm2(dphi_e)
+              << " norm dphi_o = " << norm2(dphi_e) << std::endl;
+
+    std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl;
+    std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl;
+
+    std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_o) " << phiDchi_e - conj(chiDphi_o) << std::endl;
+    std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_e) " << phiDchi_o - conj(chiDphi_e) << std::endl;
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (checkerboards separately)" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector chi(Grid_c); random(pRNG_c, chi);
+    CoarseVector tmp(Grid_c); tmp = Zero();
+    CoarseVector phi(Grid_c); phi = Zero();
+    CoarseVector chi_e(RBGrid_c); chi_e = Zero();
+    CoarseVector chi_o(RBGrid_c); chi_o = Zero();
+    CoarseVector phi_e(RBGrid_c); phi_e = Zero();
+    CoarseVector phi_o(RBGrid_c); phi_o = Zero();
+    CoarseVector tmp_e(RBGrid_c); tmp_e = Zero();
+    CoarseVector tmp_o(RBGrid_c); tmp_o = Zero();
+
+    pickCheckerboard(Even, chi_e, chi);
+    pickCheckerboard(Odd,  chi_o, chi);
+    pickCheckerboard(Even, tmp_e, tmp);
+    pickCheckerboard(Odd,  tmp_o, tmp);
+
+    Dc.Mooee(chi_e, tmp_e);    std::cout << GridLogMessage << "Applied Mee"    << std::endl;
+    Dc.MooeeInv(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInv" << std::endl;
+    Dc.Mooee(chi_o, tmp_o);    std::cout << GridLogMessage << "Applied Moo"    << std::endl;
+    Dc.MooeeInv(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInv" << std::endl;
+
+    setCheckerboard(phi, phi_e);
+    setCheckerboard(phi, phi_o);
+
+    diff = chi - phi;
+    auto absDev = norm2(diff);
+    auto relDev = absDev / norm2(chi);
+    std::cout << GridLogMessage << "norm2(chi), norm2(MeeInv Mee chi), abs. deviation, rel. deviation: "
+              << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev
+              << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+    assert(relDev <= checkTolerance);
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test MooeeDag MooeeInvDag = 1 (checkerboards separately)" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector chi(Grid_c); random(pRNG_c, chi);
+    CoarseVector tmp(Grid_c); tmp = Zero();
+    CoarseVector phi(Grid_c); phi = Zero();
+    CoarseVector chi_e(RBGrid_c); chi_e = Zero();
+    CoarseVector chi_o(RBGrid_c); chi_o = Zero();
+    CoarseVector phi_e(RBGrid_c); phi_e = Zero();
+    CoarseVector phi_o(RBGrid_c); phi_o = Zero();
+    CoarseVector tmp_e(RBGrid_c); tmp_e = Zero();
+    CoarseVector tmp_o(RBGrid_c); tmp_o = Zero();
+
+    pickCheckerboard(Even, chi_e, chi);
+    pickCheckerboard(Odd,  chi_o, chi);
+    pickCheckerboard(Even, tmp_e, tmp);
+    pickCheckerboard(Odd,  tmp_o, tmp);
+
+    Dc.MooeeDag(chi_e, tmp_e);    std::cout << GridLogMessage << "Applied MeeDag"    << std::endl;
+    Dc.MooeeInvDag(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInvDag" << std::endl;
+    Dc.MooeeDag(chi_o, tmp_o);    std::cout << GridLogMessage << "Applied MooDag"    << std::endl;
+    Dc.MooeeInvDag(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInvDag" << std::endl;
+
+    setCheckerboard(phi, phi_e);
+    setCheckerboard(phi, phi_o);
+
+    diff = chi - phi;
+    auto absDev = norm2(diff);
+    auto relDev = absDev / norm2(chi);
+    std::cout << GridLogMessage << "norm2(chi), norm2(MeeDag MeeInvDag chi), abs. deviation, rel. deviation: "
+              << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev
+              << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+    assert(relDev <= checkTolerance);
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test Meo + Moe + Moo + Mee = Munprec" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector chi(Grid_c); chi = Zero();
+    CoarseVector phi(Grid_c); phi = Zero();
+    CoarseVector ref(Grid_c); ref = Zero();
+    CoarseVector src_e(RBGrid_c); src_e = Zero();
+    CoarseVector src_o(RBGrid_c); src_o = Zero();
+    CoarseVector phi_e(RBGrid_c); phi_e = Zero();
+    CoarseVector phi_o(RBGrid_c); phi_o = Zero();
+    CoarseVector chi_e(RBGrid_c); chi_e = Zero();
+    CoarseVector chi_o(RBGrid_c); chi_o = Zero();
+
+    pickCheckerboard(Even, src_e, src);
+    pickCheckerboard(Odd,  src_o, src);
+    pickCheckerboard(Even, phi_e, phi);
+    pickCheckerboard(Odd,  phi_o, phi);
+    pickCheckerboard(Even, chi_e, chi);
+    pickCheckerboard(Odd,  chi_o, chi);
+
+    // M phi = (Mooee src_e + Meooe src_o , Mooee src_o + Meooe src_e)
+
+    Dc.M(src, ref); // Reference result from the unpreconditioned operator
+
+    // EO matrix
+    Dc.Mooee(src_e, chi_e); std::cout << GridLogMessage << "Applied Mee" << std::endl;
+    Dc.Mooee(src_o, chi_o); std::cout << GridLogMessage << "Applied Moo" << std::endl;
+    Dc.Meooe(src_o, phi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl;
+    Dc.Meooe(src_e, phi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl;
+
+    phi_o += chi_o;
+    phi_e += chi_e;
+
+    setCheckerboard(phi, phi_e);
+    setCheckerboard(phi, phi_o);
+
+    std::cout << GridLogDebug << "norm phi_e = " << norm2(phi_e) << " norm phi_o = " << norm2(phi_o) << " norm phi = " << norm2(phi) << std::endl;
+
+    diff = ref - phi;
+    auto absDev = norm2(diff);
+    auto relDev = absDev / norm2(ref);
+    std::cout << GridLogMessage << "norm2(Dunprec), norm2(Deoprec), abs. deviation, rel. deviation: "
+              << norm2(ref) << " " << norm2(phi) << " " << absDev << " " << relDev
+              << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+    assert(relDev <= checkTolerance);
+  }
+
+  {
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Test MpcDagMpc is hermitian" << std::endl;
+    std::cout << GridLogMessage << "===========================================================================" << std::endl;
+
+    CoarseVector phi(Grid_c); random(pRNG_c, phi);
+    CoarseVector chi(Grid_c); random(pRNG_c, chi);
+    CoarseVector chi_e(RBGrid_c);   chi_e = Zero();
+    CoarseVector chi_o(RBGrid_c);   chi_o = Zero();
+    CoarseVector dchi_e(RBGrid_c); dchi_e = Zero();
+    CoarseVector dchi_o(RBGrid_c); dchi_o = Zero();
+    CoarseVector phi_e(RBGrid_c);   phi_e = Zero();
+    CoarseVector phi_o(RBGrid_c);   phi_o = Zero();
+    CoarseVector dphi_e(RBGrid_c); dphi_e = Zero();
+    CoarseVector dphi_o(RBGrid_c); dphi_o = Zero();
+
+    pickCheckerboard(Even, chi_e, chi);
+    pickCheckerboard(Odd,  chi_o, chi);
+    pickCheckerboard(Even, phi_e, phi);
+    pickCheckerboard(Odd,  phi_o, phi);
+
+    SchurDiagMooeeOperator<CoarseDiracMatrix,CoarseVector> HermOpEO(Dc);
+
+    HermOpEO.MpcDagMpc(chi_e, dchi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_e" << std::endl;
+    HermOpEO.MpcDagMpc(chi_o, dchi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_o" << std::endl;
+    HermOpEO.MpcDagMpc(phi_e, dphi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_e" << std::endl;
+    HermOpEO.MpcDagMpc(phi_o, dphi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_o" << std::endl;
+
+    ComplexD phiDchi_e = innerProduct(phi_e, dchi_e);
+    ComplexD phiDchi_o = innerProduct(phi_o, dchi_o);
+    ComplexD chiDphi_e = innerProduct(chi_e, dphi_e);
+    ComplexD chiDphi_o = innerProduct(chi_o, dphi_o);
+
+    std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl;
+    std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl;
+
+    std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_e) " << phiDchi_e - conj(chiDphi_e) << std::endl;
+    std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_o) " << phiDchi_o - conj(chiDphi_o) << std::endl;
+  }
+
+  Grid_finalize();
+}

From 01652d8cfea1ad463b037b20aabfc4f3b0dc7d37 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 13 Sep 2020 05:56:02 -0400
Subject: [PATCH 08/14] SlabAllocator

---
 Grid/threads/Accelerator.h    |  13 +--
 Grid/threads/SlabAllocator.cc | 161 ++++++++++++++++++++++++++++++++++
 2 files changed, 163 insertions(+), 11 deletions(-)
 create mode 100644 Grid/threads/SlabAllocator.cc

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 1a3dfdc2..89d45a17 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -153,18 +153,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   }
   return ptr;
 };
-inline void *acceleratorAllocDevice(size_t bytes)
-{
-  void *ptr=NULL;
-  auto err = cudaMalloc((void **)&ptr,bytes);
-  if( err != cudaSuccess ) {
-    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
-  }
-  return ptr;
-};
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline int  acceleratorIsCommunicable(void *ptr)
@@ -176,6 +165,8 @@ inline int  acceleratorIsCommunicable(void *ptr)
   if(uvm) return 0;
   else    return 1;
 }
+void *acceleratorAllocDevice(size_t bytes);
+void acceleratorFreeDevice(void* ptr);
 
 #endif
 
diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc
new file mode 100644
index 00000000..da863687
--- /dev/null
+++ b/Grid/threads/SlabAllocator.cc
@@ -0,0 +1,161 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/threads/SlabAllocator.cc
+
+    Copyright (C) 2020
+
+Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/GridCore.h>
+
+#include <unordered_set>
+
+NAMESPACE_BEGIN(Grid);
+
+#ifdef GRID_CUDA
+
+#define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024)
+#define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024)
+
+void *acceleratorAllocDeviceCUDA(size_t bytes) {
+  void *ptr=NULL;
+  auto err = cudaMalloc((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = (void *) NULL;
+    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+  }
+  return ptr;
+}
+
+void acceleratorFreeDeviceCUDA(void *ptr) {
+  cudaFree(ptr);
+}
+
+struct grid_device_heap_slab_t {
+  void* Ptr;
+  size_t ElementSize;
+  size_t Elements;
+  std::unordered_set<uint32_t> Allocated;
+  std::unordered_set<uint32_t> Available;
+};
+
+std::unordered_map<void*, grid_device_heap_slab_t*> DeviceHeapPtrTable;
+std::unordered_map<size_t, std::unordered_set<grid_device_heap_slab_t*> > DeviceHeapSlabTable;
+
+void* SlabAllocateElement(grid_device_heap_slab_t* slab) {
+  assert(!slab->Available.empty());
+  auto available = slab->Available.begin();
+  auto slot = *available;
+  slab->Allocated.insert(slot);
+  slab->Available.erase(available);
+  
+  void* Ptr = (void*)((char*)slab->Ptr + slot * slab->ElementSize);
+  DeviceHeapPtrTable[Ptr] = slab;
+  
+  //std::cout << "Allocate element " << slot << " of slab " << slab << " of size " << slab->ElementSize << " with elements " << slab->Elements << 
+  //  " (allocated = " << slab->Allocated.size() << ", available = " << slab->Available.size() << ")" << std::endl;
+  
+  return Ptr;
+}
+
+void SlabRemove(grid_device_heap_slab_t* slab) {
+  auto & t = DeviceHeapSlabTable[slab->ElementSize];
+  assert(slab->Ptr);
+  DeviceHeapPtrTable.erase(slab->Ptr);
+  acceleratorFreeDeviceCUDA(slab->Ptr);
+  assert(t.count(slab) == 1);
+  t.erase(slab);
+  delete slab;
+  //std::cout << "Remove slab " << slab << std::endl;
+}
+
+void SlabFreeElement(grid_device_heap_slab_t* slab, void* ElementPtr) {
+  size_t Offset = (size_t)ElementPtr - (size_t)slab->Ptr;
+  //std::cout << "SlabFreeElement offset " << Offset << std::endl;
+  assert(Offset < GRID_DEVICE_HEAP_SLAB_SIZE);
+  assert(Offset % slab->ElementSize == 0);
+  size_t slot = Offset / slab->ElementSize;
+  assert(slot >= 0);
+  assert(slab->Allocated.count(slot) == 1 && slab->Available.count(slot) == 0);
+  slab->Allocated.erase(slot);
+  slab->Available.insert(slot);
+
+  //std::cout << "Free element " << slot << " of slab" << slab << std::endl;
+  
+  if (slab->Allocated.empty()) {
+    SlabRemove(slab);
+  }
+}
+
+grid_device_heap_slab_t* SlabFind(size_t bytes) {
+
+  grid_device_heap_slab_t* slab = 0;
+  std::unordered_set<grid_device_heap_slab_t*>* slab_set = 0;
+
+  decltype(DeviceHeapSlabTable.begin()) slabs = DeviceHeapSlabTable.find(bytes);
+  if (slabs == DeviceHeapSlabTable.end()) {
+    slab_set = &DeviceHeapSlabTable[bytes];
+  } else {
+    slab_set = &slabs->second;
+  }
+
+  for (auto& s : *slab_set) {
+    if (!s->Available.empty()) {
+      slab = &(*s);
+      break;
+    }
+  }  
+
+  if (!slab) {
+    slab = new grid_device_heap_slab_t;
+    slab_set->insert(slab);
+    slab->Ptr = acceleratorAllocDeviceCUDA(GRID_DEVICE_HEAP_SLAB_SIZE);
+    slab->ElementSize = bytes;
+    slab->Elements = GRID_DEVICE_HEAP_SLAB_SIZE / bytes;
+    for (size_t i=0;i<slab->Elements;i++)
+      slab->Available.insert(i);
+    //std::cout << "New slab" << slab << std::endl;
+  }
+
+  return slab;
+}
+
+void *acceleratorAllocDevice(size_t bytes) {
+  if (bytes >= GRID_DEVICE_HEAP_SLAB_THRESHOLD) {
+    return acceleratorAllocDeviceCUDA(bytes);
+  }
+
+  return SlabAllocateElement(SlabFind(bytes));
+}
+
+void acceleratorFreeDevice(void *ptr) {
+  auto p = DeviceHeapPtrTable.find(ptr);
+  if (p == DeviceHeapPtrTable.end()) {
+    acceleratorFreeDeviceCUDA(ptr);
+  } else {
+    SlabFreeElement(p->second,ptr);
+  }
+}
+
+#endif
+
+NAMESPACE_END(Grid);

From 32ff766dbd8416b45ae042463c8f65145b75806f Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 13 Sep 2020 14:02:53 -0400
Subject: [PATCH 09/14] fix evict scheme, slab alloc

---
 Grid/allocator/MemoryManagerCache.cc | 6 ++++--
 Grid/threads/SlabAllocator.cc        | 8 ++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc
index 5dd7575e..7d4581d7 100644
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -227,12 +227,13 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   // Find if present, otherwise get or force an empty
   ////////////////////////////////////////////////////////////////////////////
   if ( EntryPresent(CpuPtr)==0 ){
-    EvictVictims(bytes);
     EntryCreate(CpuPtr,bytes,mode,hint);
   }
 
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;
+  if (!AccCache.AccPtr)
+    EvictVictims(bytes);
   
   assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
 
@@ -361,12 +362,13 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
   // Find if present, otherwise get or force an empty
   ////////////////////////////////////////////////////////////////////////////
   if ( EntryPresent(CpuPtr)==0 ){
-    EvictVictims(bytes);
     EntryCreate(CpuPtr,bytes,mode,transient);
   }
 
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;
+  if (!AccCache.AccPtr)
+    EvictVictims(bytes);
   
   assert((mode==CpuRead)||(mode==CpuWrite));
   assert(AccCache.accLock==0);  // Programming error
diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc
index da863687..5590f835 100644
--- a/Grid/threads/SlabAllocator.cc
+++ b/Grid/threads/SlabAllocator.cc
@@ -36,6 +36,9 @@ NAMESPACE_BEGIN(Grid);
 #define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024)
 #define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024)
 
+size_t currentDeviceAlloc = 0;
+std::unordered_map<void*,size_t> ptr_size;
+
 void *acceleratorAllocDeviceCUDA(size_t bytes) {
   void *ptr=NULL;
   auto err = cudaMalloc((void **)&ptr,bytes);
@@ -43,11 +46,16 @@ void *acceleratorAllocDeviceCUDA(size_t bytes) {
     ptr = (void *) NULL;
     printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
   }
+  currentDeviceAlloc += bytes;
+  ptr_size[ptr] = bytes;
+  std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl;
   return ptr;
 }
 
 void acceleratorFreeDeviceCUDA(void *ptr) {
   cudaFree(ptr);
+  currentDeviceAlloc -= ptr_size[ptr];
+  std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl;
 }
 
 struct grid_device_heap_slab_t {

From d50a2164d73d2c01048cd1f25f765a720472b2c6 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 13 Sep 2020 14:06:06 -0400
Subject: [PATCH 10/14] remove slab allocator

---
 Grid/threads/Accelerator.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 89d45a17..1a3dfdc2 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -153,7 +153,18 @@ inline void *acceleratorAllocShared(size_t bytes)
   }
   return ptr;
 };
+inline void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMalloc((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = (void *) NULL;
+    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+  }
+  return ptr;
+};
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
+inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline int  acceleratorIsCommunicable(void *ptr)
@@ -165,8 +176,6 @@ inline int  acceleratorIsCommunicable(void *ptr)
   if(uvm) return 0;
   else    return 1;
 }
-void *acceleratorAllocDevice(size_t bytes);
-void acceleratorFreeDevice(void* ptr);
 
 #endif
 

From 5cffa05c7e2965216200e7fff3183fce3f15c8bb Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 13 Sep 2020 14:06:25 -0400
Subject: [PATCH 11/14] remove slab allocator file

---
 Grid/threads/SlabAllocator.cc | 169 ----------------------------------
 1 file changed, 169 deletions(-)
 delete mode 100644 Grid/threads/SlabAllocator.cc

diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc
deleted file mode 100644
index 5590f835..00000000
--- a/Grid/threads/SlabAllocator.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./Grid/threads/SlabAllocator.cc
-
-    Copyright (C) 2020
-
-Author: Christoph Lehner <christoph@lhnr.de>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/GridCore.h>
-
-#include <unordered_set>
-
-NAMESPACE_BEGIN(Grid);
-
-#ifdef GRID_CUDA
-
-#define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024)
-#define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024)
-
-size_t currentDeviceAlloc = 0;
-std::unordered_map<void*,size_t> ptr_size;
-
-void *acceleratorAllocDeviceCUDA(size_t bytes) {
-  void *ptr=NULL;
-  auto err = cudaMalloc((void **)&ptr,bytes);
-  if( err != cudaSuccess ) {
-    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
-  }
-  currentDeviceAlloc += bytes;
-  ptr_size[ptr] = bytes;
-  std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl;
-  return ptr;
-}
-
-void acceleratorFreeDeviceCUDA(void *ptr) {
-  cudaFree(ptr);
-  currentDeviceAlloc -= ptr_size[ptr];
-  std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl;
-}
-
-struct grid_device_heap_slab_t {
-  void* Ptr;
-  size_t ElementSize;
-  size_t Elements;
-  std::unordered_set<uint32_t> Allocated;
-  std::unordered_set<uint32_t> Available;
-};
-
-std::unordered_map<void*, grid_device_heap_slab_t*> DeviceHeapPtrTable;
-std::unordered_map<size_t, std::unordered_set<grid_device_heap_slab_t*> > DeviceHeapSlabTable;
-
-void* SlabAllocateElement(grid_device_heap_slab_t* slab) {
-  assert(!slab->Available.empty());
-  auto available = slab->Available.begin();
-  auto slot = *available;
-  slab->Allocated.insert(slot);
-  slab->Available.erase(available);
-  
-  void* Ptr = (void*)((char*)slab->Ptr + slot * slab->ElementSize);
-  DeviceHeapPtrTable[Ptr] = slab;
-  
-  //std::cout << "Allocate element " << slot << " of slab " << slab << " of size " << slab->ElementSize << " with elements " << slab->Elements << 
-  //  " (allocated = " << slab->Allocated.size() << ", available = " << slab->Available.size() << ")" << std::endl;
-  
-  return Ptr;
-}
-
-void SlabRemove(grid_device_heap_slab_t* slab) {
-  auto & t = DeviceHeapSlabTable[slab->ElementSize];
-  assert(slab->Ptr);
-  DeviceHeapPtrTable.erase(slab->Ptr);
-  acceleratorFreeDeviceCUDA(slab->Ptr);
-  assert(t.count(slab) == 1);
-  t.erase(slab);
-  delete slab;
-  //std::cout << "Remove slab " << slab << std::endl;
-}
-
-void SlabFreeElement(grid_device_heap_slab_t* slab, void* ElementPtr) {
-  size_t Offset = (size_t)ElementPtr - (size_t)slab->Ptr;
-  //std::cout << "SlabFreeElement offset " << Offset << std::endl;
-  assert(Offset < GRID_DEVICE_HEAP_SLAB_SIZE);
-  assert(Offset % slab->ElementSize == 0);
-  size_t slot = Offset / slab->ElementSize;
-  assert(slot >= 0);
-  assert(slab->Allocated.count(slot) == 1 && slab->Available.count(slot) == 0);
-  slab->Allocated.erase(slot);
-  slab->Available.insert(slot);
-
-  //std::cout << "Free element " << slot << " of slab" << slab << std::endl;
-  
-  if (slab->Allocated.empty()) {
-    SlabRemove(slab);
-  }
-}
-
-grid_device_heap_slab_t* SlabFind(size_t bytes) {
-
-  grid_device_heap_slab_t* slab = 0;
-  std::unordered_set<grid_device_heap_slab_t*>* slab_set = 0;
-
-  decltype(DeviceHeapSlabTable.begin()) slabs = DeviceHeapSlabTable.find(bytes);
-  if (slabs == DeviceHeapSlabTable.end()) {
-    slab_set = &DeviceHeapSlabTable[bytes];
-  } else {
-    slab_set = &slabs->second;
-  }
-
-  for (auto& s : *slab_set) {
-    if (!s->Available.empty()) {
-      slab = &(*s);
-      break;
-    }
-  }  
-
-  if (!slab) {
-    slab = new grid_device_heap_slab_t;
-    slab_set->insert(slab);
-    slab->Ptr = acceleratorAllocDeviceCUDA(GRID_DEVICE_HEAP_SLAB_SIZE);
-    slab->ElementSize = bytes;
-    slab->Elements = GRID_DEVICE_HEAP_SLAB_SIZE / bytes;
-    for (size_t i=0;i<slab->Elements;i++)
-      slab->Available.insert(i);
-    //std::cout << "New slab" << slab << std::endl;
-  }
-
-  return slab;
-}
-
-void *acceleratorAllocDevice(size_t bytes) {
-  if (bytes >= GRID_DEVICE_HEAP_SLAB_THRESHOLD) {
-    return acceleratorAllocDeviceCUDA(bytes);
-  }
-
-  return SlabAllocateElement(SlabFind(bytes));
-}
-
-void acceleratorFreeDevice(void *ptr) {
-  auto p = DeviceHeapPtrTable.find(ptr);
-  if (p == DeviceHeapPtrTable.end()) {
-    acceleratorFreeDeviceCUDA(ptr);
-  } else {
-    SlabFreeElement(p->second,ptr);
-  }
-}
-
-#endif
-
-NAMESPACE_END(Grid);

From d4861a362ccaca6bbf300da450f46c10ea40ed29 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 23 Nov 2020 15:38:49 +0000
Subject: [PATCH 12/14] Stencil use non-UVM memory for look up table on
 enable-shared=no

---
 Grid/stencil/Stencil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 1e198972..23fc8203 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -269,7 +269,7 @@ public:
   std::vector<Vector<std::pair<int,int> > > face_table ;
   Vector<int> surface_list;
 
-  Vector<StencilEntry>  _entries; // Resident in managed memory
+  stencilVector<StencilEntry>  _entries; // Resident in managed memory
   std::vector<Packet> Packets;
   std::vector<Merge> Mergers;
   std::vector<Merge> MergersSHM;

From 683a5e5bf556f97824fd242f76d0a81ab8dd7bd8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 23 Nov 2020 15:39:51 +0000
Subject: [PATCH 13/14] Stencil use host vector for integera table on
 enable-shared=no and mirror it on device

---
 Grid/allocator/AlignedAllocator.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 4b357523..91622789 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -173,7 +173,8 @@ template<class T> using cshiftAllocator = devAllocator<T>;
 template<class T> using cshiftAllocator = std::allocator<T>;
 #endif
 
-template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
+template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
+template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
 template<class T> using commVector = std::vector<T,devAllocator<T> >;
 template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
 

From 97e264d0ff6562008be60ee4f3ba355198c7f247 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 23 Nov 2020 15:46:11 +0000
Subject: [PATCH 14/14] Christoph's changes

---
 Grid/allocator/MemoryManagerCache.cc | 36 +++++++++++++++++-----------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc
index 5dd7575e..50076eba 100644
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -1,11 +1,12 @@
 #include <Grid/GridCore.h>
-
 #ifndef GRID_UVM
 
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
+//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)
 
+
 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
 ////////////////////////////////////////////////////////////
@@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);
   
-  //  dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+   dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
   assert(AccCache.accLock==0);
   assert(AccCache.cpuLock==0);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
     AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
     DeviceBytes   -=AccCache.bytes;
     LRUremove(AccCache);
-    //    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
   }
   uint64_t CpuPtr = AccCache.CpuPtr;
   EntryErase(CpuPtr);
@@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);
   
-  //  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
   assert(AccCache.accLock==0);
   assert(AccCache.cpuLock==0);
   if(AccCache.state==AccDirty) {
@@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
     AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
     DeviceBytes   -=AccCache.bytes;
     LRUremove(AccCache);
-    //    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
   }
   uint64_t CpuPtr = AccCache.CpuPtr;
   EntryErase(CpuPtr);
@@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
   assert(AccCache.AccPtr!=(uint64_t)NULL);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  //  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   DeviceToHostBytes+=AccCache.bytes;
   DeviceToHostXfer++;
   AccCache.state=Consistent;
@@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
   }
-  //  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
   HostToDeviceBytes+=AccCache.bytes;
   HostToDeviceXfer++;
@@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   // Find if present, otherwise get or force an empty
   ////////////////////////////////////////////////////////////////////////////
   if ( EntryPresent(CpuPtr)==0 ){
-    EvictVictims(bytes);
     EntryCreate(CpuPtr,bytes,mode,hint);
   }
 
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;
-  
+  if (!AccCache.AccPtr)
+    EvictVictims(bytes); 
+
   assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
 
   assert(AccCache.cpuLock==0);  // Programming error
 
   if(AccCache.state!=Empty) {
+    dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
+		    (uint64_t)AccCache.CpuPtr,
+		    (uint64_t)CpuPtr,
+		    (uint64_t)AccCache.bytes,
+		    (uint64_t)bytes);
     assert(AccCache.CpuPtr == CpuPtr);
     assert(AccCache.bytes  ==bytes);
   }
@@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
     }
     AccCache.accLock++;
-    //    printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
+    dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
   } else if(AccCache.state==Consistent) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
       AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
     else
       AccCache.state  = Consistent; // Consistent + AccRead => Consistent
     AccCache.accLock++;
-    //    printf("Consistent entry into device accLock %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
   } else if(AccCache.state==AccDirty) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
       AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
     else
       AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
     AccCache.accLock++;
-    //    printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
+    dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
   } else {
     assert(0);
   }
@@ -361,13 +368,14 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
   // Find if present, otherwise get or force an empty
   ////////////////////////////////////////////////////////////////////////////
   if ( EntryPresent(CpuPtr)==0 ){
-    EvictVictims(bytes);
     EntryCreate(CpuPtr,bytes,mode,transient);
   }
 
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;
-  
+  if (!AccCache.AccPtr)
+     EvictVictims(bytes);
+
   assert((mode==CpuRead)||(mode==CpuWrite));
   assert(AccCache.accLock==0);  // Programming error