From d29abfdcafd8e0da9803b43cae9951f94dbf833f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 6 Oct 2023 21:03:34 -0400
Subject: [PATCH] Transfer code to Frontier now

---
 Grid/algorithms/CoarsenedMatrix.h             | 1107 -----------------
 Grid/algorithms/GeneralCoarsenedMatrix.h      |  573 ---------
 .../multigrid/{Multigrid.h => MultiGrid.h}    |    0
 3 files changed, 1680 deletions(-)
 delete mode 100644 Grid/algorithms/CoarsenedMatrix.h
 delete mode 100644 Grid/algorithms/GeneralCoarsenedMatrix.h
 rename Grid/algorithms/multigrid/{Multigrid.h => MultiGrid.h} (100%)
diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
deleted file mode 100644
index 177305d9..00000000
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ /dev/null
@@ -1,1107 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/CoarsenedMatrix.h
-
-    Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
-#define  GRID_ALGORITHM_COARSENED_MATRIX_H
-
-#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
-
-NAMESPACE_BEGIN(Grid);
-
-template<class vobj,class CComplex>
-inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
-				    const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask,
-				    const Lattice<vobj> &fineX,
-				    const Lattice<vobj> &fineY)
-{
-  typedef decltype(innerProduct(vobj(),vobj())) dotp;
-
-  GridBase *coarse(CoarseInner.Grid());
-  GridBase *fine  (fineX.Grid());
-
-  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
-  Lattice<dotp> fine_inner_msk(fine);
-
-  // Multiply could be fused with innerProduct
-  // Single block sum kernel could do both masks.
-  fine_inner = localInnerProduct(fineX,fineY);
-  mult(fine_inner_msk, fine_inner,FineMask);
-  blockSum(CoarseInner,fine_inner_msk);
-}
-
-
-class Geometry {
-public:
-  int npoint;
-  int base;
-  std::vector<int> directions   ;
-  std::vector<int> displacements;
-  std::vector<int> points_dagger;
-
-  Geometry(int _d)  {
-    
-    base = (_d==5) ? 1:0;
-
-    // make coarse grid stencil for 4d , not 5d
-    if ( _d==5 ) _d=4;
-
-    npoint = 2*_d+1;
-    directions.resize(npoint);
-    displacements.resize(npoint);
-    points_dagger.resize(npoint);
-    for(int d=0;d<_d;d++){
-      directions[d   ] = d+base;
-      directions[d+_d] = d+base;
-      displacements[d  ] = +1;
-      displacements[d+_d]= -1;
-      points_dagger[d   ] = d+_d;
-      points_dagger[d+_d] = d;
-    }
-    directions   [2*_d]=0;
-    displacements[2*_d]=0;
-    points_dagger[2*_d]=2*_d;
-  }
-
-  int point(int dir, int disp) {
-    assert(disp == -1 || disp == 0 || disp == 1);
-    assert(base+0 <= dir && dir < base+4);
-
-    // directions faster index = new indexing
-    // 4d (base = 0):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   0  1  2  3  0  1  2  3  0
-    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
-    // 5d (base = 1):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   1  2  3  4  1  2  3  4  0
-    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
-
-    // displacements faster index = old indexing
-    // 4d (base = 0):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   0  0  1  1  2  2  3  3  0
-    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
-    // 5d (base = 1):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   1  1  2  2  3  3  4  4  0
-    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
-
-    if(dir == 0 and disp == 0)
-      return 8;
-    else // New indexing
-      return (1 - disp) / 2 * 4 + dir - base;
-    // else // Old indexing
-    //   return (4 * (dir - base) + 1 - disp) / 2;
-  }
-};
-  
-template<class Fobj,class CComplex,int nbasis>
-class Aggregation {
-public:
-  typedef iVector<CComplex,nbasis >             siteVector;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-
-  GridBase *CoarseGrid;
-  GridBase *FineGrid;
-  std::vector<Lattice<Fobj> > subspace;
-  int checkerboard;
-  int Checkerboard(void){return checkerboard;}
-  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
-    CoarseGrid(_CoarseGrid),
-    FineGrid(_FineGrid),
-    subspace(nbasis,_FineGrid),
-    checkerboard(_checkerboard)
-  {
-  };
-  
-  void Orthogonalise(void){
-    CoarseScalar InnerProd(CoarseGrid); 
-    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
-    blockOrthogonalise(InnerProd,subspace);
-  } 
-  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
-    blockProject(CoarseVec,FineVec,subspace);
-  }
-  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
-    FineVec.Checkerboard() = subspace[0].Checkerboard();
-    blockPromote(CoarseVec,FineVec,subspace);
-  }
-
-  virtual void CreateSubspaceRandom(GridParallelRNG  &RNG) {
-    int nn=nbasis;
-    RealD scale;
-    FineField noise(FineGrid);
-    for(int b=0;b<nn;b++){
-      subspace[b] = Zero();
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-      subspace[b] = noise;
-    }
-  }
-  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
-  {
-
-    RealD scale;
-
-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-
-    for(int b=0;b<nn;b++){
-      
-      subspace[b] = Zero();
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-      
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-      for(int i=0;i<1;i++){
-
-	CG(hermop,noise,subspace[b]);
-
-	noise = subspace[b];
-	scale = std::pow(norm2(noise),-0.5); 
-	noise=noise*scale;
-
-      }
-
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
-      subspace[b]   = noise;
-
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
-  // and this is the best I found
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter,
-				       int ordermin,
-				       int orderstep,
-				       double filterlo
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-
-    // New normalised noise
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5); 
-    noise=noise*scale;
-
-    std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
-    std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
-	      <<ordermin<<" step "<<orderstep
-	      <<" lo"<<filterlo<<std::endl;
-
-    // Initial matrix element
-    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-    int b =0;
-    {
-      // Filter
-      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
-      Cheb(hermop,noise,Mn);
-      // normalise
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-      b++;
-    }
-
-    // Generate a full sequence of Chebyshevs
-    {
-      lo=filterlo;
-      noise=Mn;
-
-      FineField T0(FineGrid); T0 = noise;  
-      FineField T1(FineGrid); 
-      FineField T2(FineGrid);
-      FineField y(FineGrid);
-      
-      FineField *Tnm = &T0;
-      FineField *Tn  = &T1;
-      FineField *Tnp = &T2;
-
-      // Tn=T1 = (xscale M + mscale)in
-      RealD xscale = 2.0/(hi-lo);
-      RealD mscale = -(hi+lo)/(hi-lo);
-      hermop.HermOp(T0,y);
-      T1=y*xscale+noise*mscale;
-
-      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
-	
-	hermop.HermOp(*Tn,y);
-
-	autoView( y_v , y, AcceleratorWrite);
-	autoView( Tn_v , (*Tn), AcceleratorWrite);
-	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
-	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
-	const int Nsimd = CComplex::Nsimd();
-	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
-	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
-	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
-        });
-
-	// Possible more fine grained control is needed than a linear sweep,
-	// but huge productivity gain if this is simple algorithm and not a tunable
-	int m =1;
-	if ( n>=ordermin ) m=n-ordermin;
-	if ( (m%orderstep)==0 ) { 
-	  Mn=*Tnp;
-	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
-	  subspace[b] = Mn;
-	  hermop.Op(Mn,tmp); 
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-	  b++;
-	}
-
-	// Cycle pointers to avoid copies
-	FineField *swizzle = Tnm;
-	Tnm    =Tn;
-	Tn     =Tnp;
-	Tnp    =swizzle;
-	  
-      }
-    }
-    assert(b==nn);
-  }
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-
-    // New normalised noise
-    std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
-    std::cout << GridLogMessage<<" Chebyshev subspace pure noise  : nbasis "<<nn<<std::endl;
-
-
-    for(int b =0;b<nbasis;b++)
-    {
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-
-      // Initial matrix element
-      hermop.Op(noise,Mn);
-      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-      // Filter
-      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
-      Cheb(hermop,noise,Mn);
-      // normalise
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-    }
-
-  }
-
-};
-
-// Fine Object == (per site) type of fine field
-// nbasis      == number of deflation vectors
-template<class Fobj,class CComplex,int nbasis>
-class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
-public:
-    
-  typedef iVector<CComplex,nbasis >           siteVector;
-  typedef Lattice<CComplex >                  CoarseComplexField;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-  typedef iMatrix<CComplex,nbasis >  Cobj;
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-  typedef CoarseVector FermionField;
-
-  // enrich interface, use default implementation as in FermionOperator ///////
-  void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
-  void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
-  void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
-  void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
-  void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
-  void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
-
-  ////////////////////
-  // Data members
-  ////////////////////
-  Geometry         geom;
-  GridBase *       _grid; 
-  GridBase*        _cbgrid;
-  int hermitian;
-
-  CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil; 
-  CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven;
-  CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd;
-
-  std::vector<CoarseMatrix> A;
-  std::vector<CoarseMatrix> Aeven;
-  std::vector<CoarseMatrix> Aodd;
-
-  CoarseMatrix AselfInv;
-  CoarseMatrix AselfInvEven;
-  CoarseMatrix AselfInvOdd;
-
-  Vector<RealD> dag_factor;
-
-  ///////////////////////
-  // Interface
-  ///////////////////////
-  GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
-  GridBase * RedBlackGrid()     { return _cbgrid; };
-
-  int ConstEE() { return 0; }
-
-  void M (const CoarseVector &in, CoarseVector &out)
-  {
-    conformable(_grid,in.Grid());
-    conformable(in.Grid(),out.Grid());
-    out.Checkerboard() = in.Checkerboard();
-
-    SimpleCompressor<siteVector> compressor;
-
-    Stencil.HaloExchange(in,compressor);
-    autoView( in_v , in, AcceleratorRead);
-    autoView( out_v , out, AcceleratorWrite);
-    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    int npoint = geom.npoint;
-    typedef LatticeView<Cobj> Aview;
-      
-    Vector<Aview> AcceleratorViewContainer;
-  
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
-    Aview *Aview_p = & AcceleratorViewContainer[0];
-
-    const int Nsimd = CComplex::Nsimd();
-    typedef decltype(coalescedRead(in_v[0])) calcVector;
-    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
-
-    int osites=Grid()->oSites();
-
-    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
-      int ss = sss/nbasis;
-      int b  = sss%nbasis;
-      calcComplex res = Zero();
-      calcVector nbr;
-      int ptype;
-      StencilEntry *SE;
-
-      for(int point=0;point<npoint;point++){
-
-	SE=Stencil_v.GetEntry(ptype,point,ss);
-	  
-	if(SE->_is_local) { 
-	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
-	} else {
-	  nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
-	}
-	acceleratorSynchronise();
-
-	for(int bb=0;bb<nbasis;bb++) {
-	  res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
-	}
-      }
-      coalescedWrite(out_v[ss](b),res);
-      });
-
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
-  };
-
-  void Mdag (const CoarseVector &in, CoarseVector &out)
-  {
-    if(hermitian) {
-      // corresponds to Petrov-Galerkin coarsening
-      return M(in,out);
-    } else {
-      // corresponds to Galerkin coarsening
-      return MdagNonHermitian(in, out);
-    }
-  };
-
-  void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
-  {
-    conformable(_grid,in.Grid());
-    conformable(in.Grid(),out.Grid());
-    out.Checkerboard() = in.Checkerboard();
-
-    SimpleCompressor<siteVector> compressor;
-
-    Stencil.HaloExchange(in,compressor);
-    autoView( in_v , in, AcceleratorRead);
-    autoView( out_v , out, AcceleratorWrite);
-    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    int npoint = geom.npoint;
-    typedef LatticeView<Cobj> Aview;
-
-    Vector<Aview> AcceleratorViewContainer;
-
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
-    Aview *Aview_p = & AcceleratorViewContainer[0];
-
-    const int Nsimd = CComplex::Nsimd();
-    typedef decltype(coalescedRead(in_v[0])) calcVector;
-    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
-
-    int osites=Grid()->oSites();
-
-    Vector<int> points(geom.npoint, 0);
-    for(int p=0; p<geom.npoint; p++)
-      points[p] = geom.points_dagger[p];
-
-    auto points_p = &points[0];
-
-    RealD* dag_factor_p = &dag_factor[0];
-
-    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
-      int ss = sss/nbasis;
-      int b  = sss%nbasis;
-      calcComplex res = Zero();
-      calcVector nbr;
-      int ptype;
-      StencilEntry *SE;
-
-      for(int p=0;p<npoint;p++){
-        int point = points_p[p];
-
-	SE=Stencil_v.GetEntry(ptype,point,ss);
-
-	if(SE->_is_local) {
-	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
-	} else {
-	  nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
-	}
-	acceleratorSynchronise();
-
-	for(int bb=0;bb<nbasis;bb++) {
-	  res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
-	}
-      }
-      coalescedWrite(out_v[ss](b),res);
-      });
-
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
-  }
-
-  void MdirComms(const CoarseVector &in)
-  {
-    SimpleCompressor<siteVector> compressor;
-    Stencil.HaloExchange(in,compressor);
-  }
-  void MdirCalc(const CoarseVector &in, CoarseVector &out, int point)
-  {
-    conformable(_grid,in.Grid());
-    conformable(_grid,out.Grid());
-    out.Checkerboard() = in.Checkerboard();
-
-    typedef LatticeView<Cobj> Aview;
-    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
-    Aview *Aview_p = & AcceleratorViewContainer[0];
-
-    autoView( out_v , out, AcceleratorWrite);
-    autoView( in_v  , in, AcceleratorRead);
-    autoView( Stencil_v  , Stencil, AcceleratorRead);
-
-    const int Nsimd = CComplex::Nsimd();
-    typedef decltype(coalescedRead(in_v[0])) calcVector;
-    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
-
-    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
-      int ss = sss/nbasis;
-      int b  = sss%nbasis;
-      calcComplex res = Zero();
-      calcVector nbr;
-      int ptype;
-      StencilEntry *SE;
-
-      SE=Stencil_v.GetEntry(ptype,point,ss);
-	  
-      if(SE->_is_local) { 
-	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
-      } else {
-	nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
-      }
-      acceleratorSynchronise();
-
-      for(int bb=0;bb<nbasis;bb++) {
-	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
-      }
-      coalescedWrite(out_v[ss](b),res);
-    });
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
-  }
-  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
-  {
-    this->MdirComms(in);
-    int ndir=geom.npoint-1;
-    if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { 
-      std::cout <<"MdirAll out size "<< out.size()<<std::endl;
-      std::cout <<"MdirAll ndir "<< ndir<<std::endl;
-      assert(0);
-    }
-    for(int p=0;p<ndir;p++){
-      MdirCalc(in,out[p],p);
-    }
-  };
-  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
-
-    this->MdirComms(in);
-
-    MdirCalc(in,out,geom.point(dir,disp));
-  };
-
-  void Mdiag(const CoarseVector &in, CoarseVector &out)
-  {
-    int point=geom.npoint-1;
-    MdirCalc(in, out, point); // No comms
-  };
-
-  void Mooee(const CoarseVector &in, CoarseVector &out) {
-    MooeeInternal(in, out, DaggerNo, InverseNo);
-  }
-
-  void MooeeInv(const CoarseVector &in, CoarseVector &out) {
-    MooeeInternal(in, out, DaggerNo, InverseYes);
-  }
-
-  void MooeeDag(const CoarseVector &in, CoarseVector &out) {
-    MooeeInternal(in, out, DaggerYes, InverseNo);
-  }
-
-  void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
-    MooeeInternal(in, out, DaggerYes, InverseYes);
-  }
-
-  void Meooe(const CoarseVector &in, CoarseVector &out) {
-    if(in.Checkerboard() == Odd) {
-      DhopEO(in, out, DaggerNo);
-    } else {
-      DhopOE(in, out, DaggerNo);
-    }
-  }
-
-  void MeooeDag(const CoarseVector &in, CoarseVector &out) {
-    if(in.Checkerboard() == Odd) {
-      DhopEO(in, out, DaggerYes);
-    } else {
-      DhopOE(in, out, DaggerYes);
-    }
-  }
-
-  void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
-    conformable(in.Grid(), _grid); // verifies full grid
-    conformable(in.Grid(), out.Grid());
-
-    out.Checkerboard() = in.Checkerboard();
-
-    DhopInternal(Stencil, A, in, out, dag);
-  }
-
-  void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
-    conformable(in.Grid(), _cbgrid);    // verifies half grid
-    conformable(in.Grid(), out.Grid()); // drops the cb check
-
-    assert(in.Checkerboard() == Even);
-    out.Checkerboard() = Odd;
-
-    DhopInternal(StencilEven, Aodd, in, out, dag);
-  }
-
-  void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
-    conformable(in.Grid(), _cbgrid);    // verifies half grid
-    conformable(in.Grid(), out.Grid()); // drops the cb check
-
-    assert(in.Checkerboard() == Odd);
-    out.Checkerboard() = Even;
-
-    DhopInternal(StencilOdd, Aeven, in, out, dag);
-  }
-
-  void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
-    out.Checkerboard() = in.Checkerboard();
-    assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
-
-    CoarseMatrix *Aself = nullptr;
-    if(in.Grid()->_isCheckerBoarded) {
-      if(in.Checkerboard() == Odd) {
-        Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
-        DselfInternal(StencilOdd, *Aself, in, out, dag);
-      } else {
-        Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
-        DselfInternal(StencilEven, *Aself, in, out, dag);
-      }
-    } else {
-      Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
-      DselfInternal(Stencil, *Aself, in, out, dag);
-    }
-    assert(Aself != nullptr);
-  }
-
-  void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
-                       const CoarseVector &in, CoarseVector &out, int dag) {
-    int point = geom.npoint-1;
-    autoView( out_v, out, AcceleratorWrite);
-    autoView( in_v,  in,  AcceleratorRead);
-    autoView( st_v,  st,  AcceleratorRead);
-    autoView( a_v,   a,   AcceleratorRead);
-
-    const int Nsimd = CComplex::Nsimd();
-    typedef decltype(coalescedRead(in_v[0])) calcVector;
-    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
-
-    RealD* dag_factor_p = &dag_factor[0];
-
-    if(dag) {
-      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
-        int ss = sss/nbasis;
-        int b  = sss%nbasis;
-        calcComplex res = Zero();
-        calcVector nbr;
-        int ptype;
-        StencilEntry *SE;
-
-        SE=st_v.GetEntry(ptype,point,ss);
-
-        if(SE->_is_local) {
-          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
-        } else {
-          nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
-        }
-        acceleratorSynchronise();
-
-        for(int bb=0;bb<nbasis;bb++) {
-          res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
-        }
-        coalescedWrite(out_v[ss](b),res);
-      });
-    } else {
-      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
-        int ss = sss/nbasis;
-        int b  = sss%nbasis;
-        calcComplex res = Zero();
-        calcVector nbr;
-        int ptype;
-        StencilEntry *SE;
-
-        SE=st_v.GetEntry(ptype,point,ss);
-
-        if(SE->_is_local) {
-          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
-        } else {
-          nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
-        }
-        acceleratorSynchronise();
-
-        for(int bb=0;bb<nbasis;bb++) {
-          res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
-        }
-        coalescedWrite(out_v[ss](b),res);
-      });
-    }
-  }
-
-  void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a,
-                    const CoarseVector &in, CoarseVector &out, int dag) {
-    SimpleCompressor<siteVector> compressor;
-
-    st.HaloExchange(in,compressor);
-    autoView( in_v,  in,  AcceleratorRead);
-    autoView( out_v, out, AcceleratorWrite);
-    autoView( st_v , st,  AcceleratorRead);
-    typedef LatticeView<Cobj> Aview;
-
-    // determine in what order we need the points
-    int npoint = geom.npoint-1;
-    Vector<int> points(npoint, 0);
-    for(int p=0; p<npoint; p++)
-      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
-
-    auto points_p = &points[0];
-
-    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
-    Aview *Aview_p = & AcceleratorViewContainer[0];
-
-    const int Nsimd = CComplex::Nsimd();
-    typedef decltype(coalescedRead(in_v[0])) calcVector;
-    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
-
-    RealD* dag_factor_p = &dag_factor[0];
-
-    if(dag) {
-      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
-        int ss = sss/nbasis;
-        int b  = sss%nbasis;
-        calcComplex res = Zero();
-        calcVector nbr;
-        int ptype;
-        StencilEntry *SE;
-
-        for(int p=0;p<npoint;p++){
-          int point = points_p[p];
-          SE=st_v.GetEntry(ptype,point,ss);
-
-          if(SE->_is_local) {
-            nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
-          } else {
-            nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
-          }
-          acceleratorSynchronise();
-
-          for(int bb=0;bb<nbasis;bb++) {
-            res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
-          }
-        }
-        coalescedWrite(out_v[ss](b),res);
-      });
-    } else {
-      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
-        int ss = sss/nbasis;
-        int b  = sss%nbasis;
-        calcComplex res = Zero();
-        calcVector nbr;
-        int ptype;
-        StencilEntry *SE;
-
-        for(int p=0;p<npoint;p++){
-          int point = points_p[p];
-          SE=st_v.GetEntry(ptype,point,ss);
-
-          if(SE->_is_local) {
-            nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
-          } else {
-            nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
-          }
-          acceleratorSynchronise();
-
-          for(int bb=0;bb<nbasis;bb++) {
-            res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
-          }
-        }
-        coalescedWrite(out_v[ss](b),res);
-      });
-    }
-
-    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
-  }
-  
-  CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	:
-    _grid(&CoarseGrid),
-    _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
-    geom(CoarseGrid._ndimension),
-    hermitian(hermitian_),
-    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements),
-    A(geom.npoint,&CoarseGrid),
-    Aeven(geom.npoint,_cbgrid),
-    Aodd(geom.npoint,_cbgrid),
-    AselfInv(&CoarseGrid),
-    AselfInvEven(_cbgrid),
-    AselfInvOdd(_cbgrid),
-    dag_factor(nbasis*nbasis)
-  {
-    fillFactor();
-  };
-
-  CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) 	:
-
-    _grid(&CoarseGrid),
-    _cbgrid(&CoarseRBGrid),
-    geom(CoarseGrid._ndimension),
-    hermitian(hermitian_),
-    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements),
-    A(geom.npoint,&CoarseGrid),
-    Aeven(geom.npoint,&CoarseRBGrid),
-    Aodd(geom.npoint,&CoarseRBGrid),
-    AselfInv(&CoarseGrid),
-    AselfInvEven(&CoarseRBGrid),
-    AselfInvOdd(&CoarseRBGrid),
-    dag_factor(nbasis*nbasis)
-  {
-    fillFactor();
-  };
-
-  void fillFactor() {
-    Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis);
-    if(!hermitian) {
-      const int nb = nbasis/2;
-      dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
-      dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
-    }
-
-    // GPU readable prefactor
-    thread_for(i, nbasis*nbasis, {
-      int j = i/nbasis;
-      int k = i%nbasis;
-      dag_factor[i] = dag_factor_eigen(j, k);
-    });
-  }
-
-  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
-  {
-    typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
-    typedef typename Fobj::scalar_type scalar_type;
-
-    std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
-
-    FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
-    FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
-
-    std::vector<FineComplexField> masks(geom.npoint,FineGrid);
-    FineComplexField imask(FineGrid); // contributions from within this block
-    FineComplexField omask(FineGrid); // contributions from outwith this block
-
-    FineComplexField evenmask(FineGrid);
-    FineComplexField oddmask(FineGrid); 
-
-    FineField     phi(FineGrid);
-    FineField     tmp(FineGrid);
-    FineField     zz(FineGrid); zz=Zero();
-    FineField    Mphi(FineGrid);
-    FineField    Mphie(FineGrid);
-    FineField    Mphio(FineGrid);
-    std::vector<FineField>     Mphi_p(geom.npoint,FineGrid);
-
-    Lattice<iScalar<vInteger> > coor (FineGrid);
-    Lattice<iScalar<vInteger> > bcoor(FineGrid);
-    Lattice<iScalar<vInteger> > bcb  (FineGrid); bcb = Zero();
-
-    CoarseVector iProj(Grid()); 
-    CoarseVector oProj(Grid()); 
-    CoarseVector SelfProj(Grid()); 
-    CoarseComplexField iZProj(Grid()); 
-    CoarseComplexField oZProj(Grid()); 
-
-    CoarseScalar InnerProd(Grid()); 
-
-    std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl;
-    // Orthogonalise the subblocks over the basis
-    blockOrthogonalise(InnerProd,Subspace.subspace);
-
-    // Compute the matrix elements of linop between this orthonormal
-    // set of vectors.
-    std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
-    int self_stencil=-1;
-    for(int p=0;p<geom.npoint;p++)
-    { 
-      int dir   = geom.directions[p];
-      int disp  = geom.displacements[p];
-      A[p]=Zero();
-      if( geom.displacements[p]==0){
-	self_stencil=p;
-      }
-
-      Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
-
-      LatticeCoordinate(coor,dir);
-
-      ///////////////////////////////////////////////////////
-      // Work out even and odd block checkerboarding for fast diagonal term
-      ///////////////////////////////////////////////////////
-      if ( disp==1 ) {
-	bcb   = bcb + div(coor,block);
-      }
-	
-      if ( disp==0 ) {
-	  masks[p]= Zero();
-      } else if ( disp==1 ) {
-	masks[p] = where(mod(coor,block)==(block-1),one,zero);
-      } else if ( disp==-1 ) {
-	masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
-      }
-    }
-    evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
-    oddmask  = one-evenmask;
-
-    assert(self_stencil!=-1);
-
-    for(int i=0;i<nbasis;i++){
-
-      phi=Subspace.subspace[i];
-
-      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
-      linop.OpDirAll(phi,Mphi_p);
-      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);
-
-      for(int p=0;p<geom.npoint;p++){ 
-
-	Mphi = Mphi_p[p];
-
-	int dir   = geom.directions[p];
-	int disp  = geom.displacements[p];
-
-	if ( (disp==-1) || (!hermitian ) ) {
-
-	  ////////////////////////////////////////////////////////////////////////
-	  // Pick out contributions coming from this cell and neighbour cell
-	  ////////////////////////////////////////////////////////////////////////
-	  omask = masks[p];
-	  imask = one-omask;
-	
-	  for(int j=0;j<nbasis;j++){
-	    
-	    blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
-	    
-	    autoView( iZProj_v , iZProj, AcceleratorRead) ;
-	    autoView( oZProj_v , oZProj, AcceleratorRead) ;
-	    autoView( A_p     ,  A[p], AcceleratorWrite);
-	    autoView( A_self  , A[self_stencil], AcceleratorWrite);
-
-	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
-	    if ( hermitian && (disp==-1) ) {
-	      for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
-		int dirp   = geom.directions[pp];
-		int dispp  = geom.displacements[pp];
-		if ( (dirp==dir) && (dispp==1) ){
-		  auto sft = conjugate(Cshift(oZProj,dir,1));
-		  autoView( sft_v    ,  sft  , AcceleratorWrite);
-		  autoView( A_pp     ,  A[pp], AcceleratorWrite);
-		  accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
-		}
-	      }
-	    }
-
-	  }
-	}
-      }
-
-      ///////////////////////////////////////////
-      // Faster alternate self coupling.. use hermiticity to save 2x
-      ///////////////////////////////////////////
-      {
-	mult(tmp,phi,evenmask);  linop.Op(tmp,Mphie);
-	mult(tmp,phi,oddmask );  linop.Op(tmp,Mphio);
-
-	{
-	  autoView( tmp_      , tmp, AcceleratorWrite);
-	  autoView( evenmask_ , evenmask, AcceleratorRead);
-	  autoView( oddmask_  ,  oddmask, AcceleratorRead);
-	  autoView( Mphie_    ,  Mphie, AcceleratorRead);
-	  autoView( Mphio_    ,  Mphio, AcceleratorRead);
-	  accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ 
-	      coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
-	    });
-	}
-
-	blockProject(SelfProj,tmp,Subspace.subspace);
-
-	autoView( SelfProj_ , SelfProj, AcceleratorRead);
-	autoView( A_self  , A[self_stencil], AcceleratorWrite);
-
-	accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
-	  for(int j=0;j<nbasis;j++){
-	    coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
-	  }
-	});
-
-      }
-    }
-    if(hermitian) {
-      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
-    }
-
-    InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
-    FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
-  }
-
-  void InvertSelfStencilLink() {
-    std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
-    int localVolume = Grid()->lSites();
-
-    typedef typename Cobj::scalar_object scalar_object;
-
-    autoView(Aself_v,    A[geom.npoint-1], CpuRead);
-    autoView(AselfInv_v, AselfInv,         CpuWrite);
-    thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
-      Eigen::MatrixXcd selfLinkEigen    = Eigen::MatrixXcd::Zero(nbasis, nbasis);
-      Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
-
-      scalar_object selfLink    = Zero();
-      scalar_object selfLinkInv = Zero();
-
-      Coordinate lcoor;
-
-      Grid()->LocalIndexToLocalCoor(site, lcoor);
-      peekLocalSite(selfLink, Aself_v, lcoor);
-
-      for (int i = 0; i < nbasis; ++i)
-        for (int j = 0; j < nbasis; ++j)
-          selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
-
-      selfLinkInvEigen = selfLinkEigen.inverse();
-
-      for(int i = 0; i < nbasis; ++i)
-        for(int j = 0; j < nbasis; ++j)
-          selfLinkInv(i, j) = selfLinkInvEigen(i, j);
-
-      pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
-    });
-  }
-
-  void FillHalfCbs() {
-    std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
-    for(int p = 0; p < geom.npoint; ++p) {
-      pickCheckerboard(Even, Aeven[p], A[p]);
-      pickCheckerboard(Odd, Aodd[p], A[p]);
-    }
-    pickCheckerboard(Even, AselfInvEven, AselfInv);
-    pickCheckerboard(Odd, AselfInvOdd, AselfInv);
-  }
-};
-
-NAMESPACE_END(Grid);
-#endif
diff --git a/Grid/algorithms/GeneralCoarsenedMatrix.h b/Grid/algorithms/GeneralCoarsenedMatrix.h
deleted file mode 100644
index 86971fdf..00000000
--- a/Grid/algorithms/GeneralCoarsenedMatrix.h
+++ /dev/null
@@ -1,573 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pboyle@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
-
-#include <Grid/lattice/PaddedCell.h>
-#include <Grid/stencil/GeneralLocalStencil.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Fixme need coalesced read gpermute
-template<class vobj> void gpermute(vobj & inout,int perm){
-  vobj tmp=inout;
-  if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
-  if (perm & 0x2 ) { permute(inout,tmp,1); tmp=inout;}
-  if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
-  if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
-}
-
-/////////////////////////////////////////////////////////////////
-// Reuse Aggregation class from CoarsenedMatrix for now
-// Might think about *smoothed* Aggregation
-// Equivalent of Geometry class in cartesian case
-/////////////////////////////////////////////////////////////////
-class NonLocalStencilGeometry {
-public:
-  int depth;
-  int hops;
-  int npoint;
-  std::vector<Coordinate> shifts;
-  Coordinate stencil_size;
-  Coordinate stencil_lo;
-  Coordinate stencil_hi;
-  GridCartesian *grid;
-  GridCartesian *Grid() {return grid;};
-  int Depth(void){return 1;};   // Ghost zone depth
-  int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
-
-  virtual int DimSkip(void) =0;
-
-  virtual ~NonLocalStencilGeometry() {};
-
-  int  Reverse(int point)
-  {
-    int Nd = Grid()->Nd();
-    Coordinate shft = shifts[point];
-    Coordinate rev(Nd);
-    for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
-    for(int p=0;p<npoint;p++){
-      if(rev==shifts[p]){
-	return p;
-      }
-    }
-    assert(0);
-    return -1;
-  }
-  void BuildShifts(void)
-  {
-    this->shifts.resize(0);
-    int Nd = this->grid->Nd();
-
-    int dd = this->DimSkip();
-    for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
-    for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
-    for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
-    for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
-      Coordinate sft(Nd,0);
-      sft[dd+0] = s0;
-      sft[dd+1] = s1;
-      sft[dd+2] = s2;
-      sft[dd+3] = s3;
-      int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
-      if(nhops<=this->hops) this->shifts.push_back(sft);
-    }}}}
-    this->npoint = this->shifts.size();
-    std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
-  }
-  
-  NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops) : grid(_coarse_grid), hops(_hops)
-  {
-    Coordinate latt = grid->GlobalDimensions();
-    stencil_size.resize(grid->Nd());
-    stencil_lo.resize(grid->Nd());
-    stencil_hi.resize(grid->Nd());
-    for(int d=0;d<grid->Nd();d++){
-     if ( latt[d] == 1 ) {
-      stencil_lo[d] = 0;
-      stencil_hi[d] = 0;
-      stencil_size[d]= 1;
-     } else if ( latt[d] == 2 ) {
-      stencil_lo[d] = -1;
-      stencil_hi[d] = 0;
-      stencil_size[d]= 2;
-     } else if ( latt[d] > 2 ) {
-       stencil_lo[d] = -1;
-       stencil_hi[d] =  1;
-       stencil_size[d]= 3;
-     }
-    }
-  };
-
-};
-
-// Need to worry about red-black now
-class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
-public:
-  virtual int DimSkip(void) { return 0;};
-  NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops) { };
-  virtual ~NonLocalStencilGeometry4D() {};
-};
-class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
-public:
-  virtual int DimSkip(void) { return 1; }; 
-  NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops)  { };
-  virtual ~NonLocalStencilGeometry5D() {};
-};
-/*
- * Bunch of different options classes
- */
-class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
-public:
-  NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,4)
-  {
-    this->BuildShifts();
-  };
-};
-class NextToNextToNextToNearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,4)
-  {
-    this->BuildShifts();
-  };
-};
-class NextToNearestStencilGeometry4D : public  NonLocalStencilGeometry4D {
-public:
-  NextToNearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,2)
-  {
-    this->BuildShifts();
-  };
-};
-class NextToNearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NextToNearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,2)
-  {
-    this->BuildShifts();
-  };
-};
-class NearestStencilGeometry4D : public  NonLocalStencilGeometry4D {
-public:
-  NearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,1)
-  {
-    this->BuildShifts();
-  };
-};
-class NearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,1)
-  {
-    this->BuildShifts();
-  };
-};
-
-// Fine Object == (per site) type of fine field
-// nbasis      == number of deflation vectors
-template<class Fobj,class CComplex,int nbasis>
-class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
-public:
-
-  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
-  typedef iVector<CComplex,nbasis >           siteVector;
-  typedef iMatrix<CComplex,nbasis >           siteMatrix;
-  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-  typedef iMatrix<CComplex,nbasis >  Cobj;
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-  typedef CoarseVector Field;
-  ////////////////////
-  // Data members
-  ////////////////////
-  int hermitian;
-  GridBase      *       _FineGrid; 
-  GridCartesian *       _CoarseGrid; 
-  NonLocalStencilGeometry &geom;
-  PaddedCell Cell;
-  GeneralLocalStencil Stencil;
-  
-  std::vector<CoarseMatrix> _A;
-  std::vector<CoarseMatrix> _Adag;
-
-  ///////////////////////
-  // Interface
-  ///////////////////////
-  GridBase      * Grid(void)           { return _FineGrid; };   // this is all the linalg routines need to know
-  GridBase      * FineGrid(void)       { return _FineGrid; };   // this is all the linalg routines need to know
-  GridCartesian * CoarseGrid(void)     { return _CoarseGrid; };   // this is all the linalg routines need to know
-
-
-  void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
-  {
-    int nfound=0;
-    std::cout << " ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
-    for(int p=0;p<geom.npoint;p++){
-      for(int pp=0;pp<CopyMe.geom.npoint;pp++){
- 	// Search for the same relative shift
-	// Avoids brutal handling of Grid pointers
-	if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
-	  _A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
-	  _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
-	  nfound++;
-	}
-      }
-    }
-    assert(nfound==geom.npoint);
-    ExchangeCoarseLinks();
-  }
-  
-  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
-    : geom(_geom),
-      _FineGrid(FineGrid),
-      _CoarseGrid(CoarseGrid),
-      hermitian(1),
-      Cell(_geom.Depth(),_CoarseGrid),
-      Stencil(Cell.grids.back(),geom.shifts)
-  {
-    {
-      int npoint = _geom.npoint;
-      autoView( Stencil_v  , Stencil, AcceleratorRead);
-      int osites=Stencil.Grid()->oSites();
-      for(int ss=0;ss<osites;ss++){
-	for(int point=0;point<npoint;point++){
-	  auto SE = Stencil_v.GetEntry(point,ss);
-	  int o = SE->_offset;
-	  assert( o< osites);
-	}
-      }    
-    }
-
-    _A.resize(geom.npoint,CoarseGrid);
-    _Adag.resize(geom.npoint,CoarseGrid);
-  }
-  void M (const CoarseVector &in, CoarseVector &out)
-  {
-    Mult(_A,in,out);
-  }
-  void Mdag (const CoarseVector &in, CoarseVector &out)
-  {
-    if ( hermitian ) M(in,out);
-    else Mult(_Adag,in,out);
-  }
-  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
-  {
-    RealD tviews=0;
-    RealD ttot=0;
-    RealD tmult=0;
-    RealD texch=0;
-    RealD text=0;
-    ttot=-usecond();
-    conformable(CoarseGrid(),in.Grid());
-    conformable(in.Grid(),out.Grid());
-    out.Checkerboard() = in.Checkerboard();
-    CoarseVector tin=in;
-
-    texch-=usecond();
-    CoarseVector pin  = Cell.Exchange(tin);
-    texch+=usecond();
-
-    CoarseVector pout(pin.Grid()); pout=Zero();
-
-    int npoint = geom.npoint;
-    typedef LatticeView<Cobj> Aview;
-      
-    const int Nsimd = CComplex::Nsimd();
-    
-    int osites=pin.Grid()->oSites();
-    //    int gsites=pin.Grid()->gSites();
-
-    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
-    RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint;
-      
-    //    for(int point=0;point<npoint;point++){
-    //      conformable(A[point],pin);
-    //    }
-
-    {
-      tviews-=usecond();
-      autoView( in_v , pin, AcceleratorRead);
-      autoView( out_v , pout, AcceleratorWrite);
-      autoView( Stencil_v  , Stencil, AcceleratorRead);
-      tviews+=usecond();
-      
-      for(int point=0;point<npoint;point++){
-	tviews-=usecond();
-	autoView( A_v, A[point],AcceleratorRead);
-	tviews+=usecond();
-	tmult-=usecond();
-	accelerator_for(sss, osites*nbasis, Nsimd, {
-
-	    typedef decltype(coalescedRead(in_v[0]))    calcVector;
-
-	    int ss = sss/nbasis;
-	    int b  = sss%nbasis;
-
-	    auto SE  = Stencil_v.GetEntry(point,ss);
-	    auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
-	    auto res = out_v(ss)(b);
-	    for(int bb=0;bb<nbasis;bb++) {
-	      res = res + coalescedRead(A_v[ss](b,bb))*nbr(bb);
-	    }
-	    coalescedWrite(out_v[ss](b),res);
-	});
-
-	tmult+=usecond();
-      }
-    }
-    text-=usecond();
-    out = Cell.Extract(pout);
-    text+=usecond();
-    ttot+=usecond();
-
-    std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse bytes   "<< bytes/1e6<<" MB"<<std::endl;
-  };
-
-  void PopulateAdag(void)
-  {
-    for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
-      Coordinate bcoor;
-      CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
-      
-      for(int p=0;p<geom.npoint;p++){
-	Coordinate scoor = bcoor;
-	for(int mu=0;mu<bcoor.size();mu++){
-	  int L = CoarseGrid()->GlobalDimensions()[mu];
-	  scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
-	}
-	// Flip to poke/peekLocalSite and not too bad
-	auto link = peekSite(_A[p],scoor);
-	int pp = geom.Reverse(p);
-	pokeSite(adj(link),_Adag[pp],bcoor);
-      }
-    }
-  }
-  /////////////////////////////////////////////////////////////
-  // 
-  // A) Only reduced flops option is to use a padded cell of depth 4
-  // and apply MpcDagMpc in the padded cell.
-  //
-  // Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
-  // With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
-  // Cost is 81x more, same as stencil size.
-  //
-  // But: can eliminate comms and do as local dirichlet.
-  //
-  // Local exchange gauge field once.
-  // Apply to all vectors, local only computation.
-  // Must exchange ghost subcells in reverse process of PaddedCell to take inner products
-  //
-  // B) Can reduce cost: pad by 1, apply Deo      (4^4+6^4+8^4+8^4 )/ (4x 4^4)
-  //                     pad by 2, apply Doe
-  //                     pad by 3, apply Deo
-  //                     then break out 8x directions; cost is ~10x MpcDagMpc per vector
-  //
-  // => almost factor of 10 in setup cost, excluding data rearrangement
-  //
-  // Intermediates -- ignore the corner terms, leave approximate and force Hermitian
-  // Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
-  /////////////////////////////////////////////////////////////
-
-    //////////////////////////////////////////////////////////
-    // BFM HDCG style approach: Solve a system of equations to get Aij
-    //////////////////////////////////////////////////////////
-    /*
-     *     Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
-     *
-     *     conj(phases[block]) proj[k][ block*Nvec+j ] =  \sum_ball  e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} > 
-     *                                                 =  \sum_ball e^{iqk.delta} A_ji
-     *
-     *     Must invert matrix M_k,l = e^[i q_k . delta_l]
-     *
-     *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
-     */
-  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
-  {
-    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
-    GridBase *grid = FineGrid();
-
-    RealD tproj=0.0;
-    RealD teigen=0.0;
-    RealD tmat=0.0;
-    RealD tphase=0.0;
-    RealD tinv=0.0;
-
-    /////////////////////////////////////////////////////////////
-    // Orthogonalise the subblocks over the basis
-    /////////////////////////////////////////////////////////////
-    CoarseScalar InnerProd(CoarseGrid()); 
-    blockOrthogonalise(InnerProd,Subspace.subspace);
-
-    const int npoint = geom.npoint;
-      
-    Coordinate clatt = CoarseGrid()->GlobalDimensions();
-    int Nd = CoarseGrid()->Nd();
-
-      /*
-       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
-       *     Matrix index i is mapped to this shift via 
-       *               geom.shifts[i]
-       *
-       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
-       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
-       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
-       *       = M_{kl} A_ji^{b.b+l}
-       *
-       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
-       *  
-       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
-       *
-       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
-       */
-    teigen-=usecond();
-    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
-    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
-    ComplexD ci(0.0,1.0);
-    for(int k=0;k<npoint;k++){ // Loop over momenta
-
-      for(int l=0;l<npoint;l++){ // Loop over nbr relative
-	ComplexD phase(0.0,0.0);
-	for(int mu=0;mu<Nd;mu++){
-	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
-	  phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
-	}
-	phase=exp(phase*ci);
-	Mkl(k,l) = phase;
-      }
-    }
-    invMkl = Mkl.inverse();
-    teigen+=usecond();
-
-    ///////////////////////////////////////////////////////////////////////
-    // Now compute the matrix elements of linop between the orthonormal
-    // set of vectors.
-    ///////////////////////////////////////////////////////////////////////
-    FineField phaV(grid); // Phased block basis vector
-    FineField MphaV(grid);// Matrix applied
-    CoarseVector coarseInner(CoarseGrid());
-
-    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
-    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
-    for(int i=0;i<nbasis;i++){// Loop over basis vectors
-      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
-      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
-	/////////////////////////////////////////////////////
-	// Stick a phase on every block
-	/////////////////////////////////////////////////////
-	tphase-=usecond();
-	CoarseComplexField coor(CoarseGrid());
-	CoarseComplexField pha(CoarseGrid());	pha=Zero();
-	for(int mu=0;mu<Nd;mu++){
-	  LatticeCoordinate(coor,mu);
-	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
-	  pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
-	}
-	pha  =exp(pha*ci);
-	phaV=Zero();
-	blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
-	tphase+=usecond();
-
-	/////////////////////////////////////////////////////////////////////
-	// Multiple phased subspace vector by matrix and project to subspace
-	// Remove local bulk phase to leave relative phases
-	/////////////////////////////////////////////////////////////////////
-	tmat-=usecond();
-	linop.Op(phaV,MphaV);
-	tmat+=usecond();
-
-	tproj-=usecond();
-	blockProject(coarseInner,MphaV,Subspace.subspace);
-	coarseInner = conjugate(pha) * coarseInner;
-
-	ComputeProj[p] = coarseInner;
-	tproj+=usecond();
-
-      }
-
-      tinv-=usecond();
-      for(int k=0;k<npoint;k++){
-	FT[k] = Zero();
-	for(int l=0;l<npoint;l++){
-	  FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
-	}
-      
-	int osites=CoarseGrid()->oSites();
-	autoView( A_v  , _A[k], AcceleratorWrite);
-	autoView( FT_v  , FT[k], AcceleratorRead);
-	accelerator_for(sss, osites, 1, {
-	    for(int j=0;j<nbasis;j++){
-	      A_v[sss](j,i) = FT_v[sss](j);
-	    }
-        });
-      }
-      tinv+=usecond();
-    }
-
-    for(int p=0;p<geom.npoint;p++){
-      Coordinate coor({0,0,0,0,0});
-      auto sval = peekSite(_A[p],coor);
-    }
-
-    // Only needed if nonhermitian
-    if ( ! hermitian ) {
-      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
-      PopulateAdag();
-    }
-
-    // Need to write something to populate Adag from A
-    std::cout << GridLogMessage<<"ExchangeCoarseLinks  "<<std::endl;
-    ExchangeCoarseLinks();
-    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator mat    "<<tmat <<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator proj   "<<tproj<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
-  }
-  void ExchangeCoarseLinks(void){
-    for(int p=0;p<geom.npoint;p++){
-      std::cout << "Exchange "<<p<<std::endl;
-      _A[p] = Cell.Exchange(_A[p]);
-      _Adag[p]= Cell.Exchange(_Adag[p]);
-    }
-  }
-  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
-};
-
-
-NAMESPACE_END(Grid);
diff --git a/Grid/algorithms/multigrid/Multigrid.h b/Grid/algorithms/multigrid/MultiGrid.h
similarity index 100%
rename from Grid/algorithms/multigrid/Multigrid.h
rename to Grid/algorithms/multigrid/MultiGrid.h