Mirror of https://github.com/paboyle/Grid.git, synced 2025-06-17 15:27:06 +01:00

Compare commits


84 Commits

SHA1 Message Date
4fefae1745 Test_evec_compression changes:
Added the ability to choose one of a variety of preselected basis sizes from the command line
	Fine Lanczos now checks that enough evecs are generated, and resizes the output to Nstop rather than the actual number that converged (which can be larger)
2022-04-06 06:33:26 -07:00
758e2edcad Test_evec_compression enhancements:
In testing the compressed evecs, a Chebyshev smoothing is now applied first to remove high-mode noise
	Added a second test where the uncompressed evecs are compared directly to the original evecs
	Generalized the test to allow for either DWF or Mobius with or without GPBC, switched by command line options
2022-03-29 06:16:15 -07:00
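
For reference, applying such a Chebyshev smoother in Grid follows the operator-function pattern of the Chebyshev class that appears later in this diff; a minimal sketch, with the window [lo,hi], the order, and the field names as illustrative placeholders:

    // Hedged sketch: smooth high-mode noise out of a reconstructed evec.
    // lo, hi and order must be tuned to the operator spectrum; HermOp is
    // the fine-grid Hermitian operator used for the Lanczos.
    Chebyshev<FermionField> Cheb(lo, hi, order);
    FermionField smoothed(FineGrid);
    Cheb(HermOp, evec_reconstructed, smoothed);
    RealD scale = std::pow(norm2(smoothed), -0.5);
    smoothed = smoothed * scale;   // renormalise after filtering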
1538b15f3b 48ID evo main program now uses reliable update CG 2022-03-14 06:45:28 -07:00
deac621c2c Merge branch 'develop' into gparity_HMC_merge_develop 2022-02-22 14:25:27 -05:00
ba974960e6 Added an HMC checkpoint start option that loads the fields and then reseeds the RNGs, suitable for creating new evolution streams
Added option to choose RNG seeds in 40ID main binary
2022-02-14 08:09:01 -08:00
6755dc57f8 Added methods to compute spatial plaquette and timeslice spatial plaquette to WilsonLoops 2022-01-24 13:57:39 -05:00
aa620ca52c Fixed compilation error in observables resulting from changes in Wilson flow code
Modified light quark mass on 40ID HMC binary
2022-01-24 09:56:24 -08:00
2c46c942cc Reworked WilsonFlow:
Both smear and smear_adaptive now maintain the Wilson flow time as a function variable rather than a class member variable. smear_adaptive does likewise for the current time step. This allows the evolve and smear functions to be const
	Fixed smear_adaptive setting initial time to epsilon rather than 0
	Added ability to assign generic measurement actions at user specified frequencies during the smearing and reimplemented current energy density / topq output in this framework
	Reimplemented the "flowMeasure" methods using the above framework
Fixed const correctness for WilsonLoops::TopologicalCharge
2022-01-24 12:06:05 -05:00
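
A plausible usage shape for the measurement-action framework described above; the addMeasurement name, callback signature, and constructor arguments are assumptions inferred from the commit message, not taken from the diff:

    // Hedged sketch (names/signatures assumed): register a measurement
    // to run every 5 smearing steps, then smear.
    WilsonFlow<PeriodicGimplR> WF(Nstep, epsilon);
    WF.addMeasurement(5, [](int step, RealD t, const LatticeGaugeField &U) {
      std::cout << "flow time " << t << " plaquette "
                << WilsonLoops<PeriodicGimplR>::avgPlaquette(U) << std::endl;
    });
    WF.smear(Uflow, U);   // Uflow receives the smeared field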
adeba8059a Added calculation of timeslice topological charge 2022-01-20 14:29:07 -05:00
c4ac528126 Added cloverleaf energy density calculation to WilsonFlow 2021-12-27 10:33:33 -05:00
551b93ba8e To HMC/Mobius2p1fIDSDRGparityEOFA_40ID, added input param to change trajectory length and increased integrator steps for DSDR 2021-12-10 09:06:06 -08:00
ddf7540510 Added calculation of 5Li topological charge
WilsonFlow code now calls topological charge calculation with correct gauge implementation rather than assuming periodic
Added version of WilsonFlow::flowMeasureEnergyDensityPlaquette that outputs the smeared gauge field at the end
2021-12-06 17:56:42 -05:00
de68d12c3d 1x1 topological charge calculation now respects gauge boundary conditions 2021-12-06 13:42:09 -05:00
6d26a2a1ad Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into gparity_HMC 2021-11-16 07:32:47 -08:00
a1211cdcce Gparity 48ID tuning and exposure of trajectory length as input variable 2021-11-16 07:31:41 -08:00
e78acf77ff To LocalCoherenceLanczos, added a method to reconstruct the fine eigenvector and added some comments to aid the user
Added a test code for local coherence Lanczos with G-parity BCs
Added a test code for block eigenvector compression
2021-11-08 07:26:35 -08:00
f7e9621492 40ID ensemble tuning: now use 5 Hasenbusch steps, parameters now separately tunable in param file 2021-10-18 08:17:36 -07:00
f14be15f8b Updates to Gparity HMC main programs 2021-10-15 08:10:17 -07:00
6a3aaa52ef Test_dwf_lanczos can now run either G-parity Mobius or non-Gparity DWF according to cmdline switch
Fixed copyStream initialization
2021-10-12 12:59:54 -07:00
9ba47b4696 Merge branch 'develop' into gparity_HMC 2021-09-29 20:07:55 -07:00
e85af80c39 Added return value checks on all cuda api calls
Test_dwf_lanczos can now run with either regular DWF or Mobius+Gparity based on cmdline arg
2021-09-29 19:57:43 -07:00
0b91e90dd4 Merge branch 'develop' into feature/gparity_HMC 2021-09-27 07:16:26 -07:00
d184b8c921 Merge branch 'develop' into gparity_HMC 2021-09-08 06:14:08 -07:00
c92e390b08 Added initial main binary code for 40ID and 48ID Gparity HMC 2021-09-08 09:00:13 -04:00
5b36a8af54 Added a CshiftLink function to the GaugeImplementations and boundary condition classes that offers a boundary aware C-shift
Modified gauge fixing code to use CshiftLink internally such that the steepest descent algorithm is universal
Modified gauge transformation code to use CshiftLink for a universal definition
Improved comprehensibility of Test_fft_gfix and generalized to use either periodic or charge conjugation BCs based on cmdline option
Added cmdline options to Test_fft_gfix to tune alpha and optionally disable the Fourier acceleration tests
2021-07-12 17:13:40 -04:00
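
To make the "universal definition" concrete: a gauge transformation of a link needs the transformation field shifted across that link, and the shift must respect the gauge boundary condition. A sketch assuming CshiftLink takes Cshift-style arguments (field, direction, displacement):

    // Hedged sketch: BC-aware gauge transformation of one link direction,
    // U_mu(x) -> g(x) U_mu(x) g^dag(x+mu), with g shifted to x+mu via
    // CshiftLink so the boundary condition is honoured.
    LatticeColourMatrix Umu(grid);
    Umu = PeekIndex<LorentzIndex>(U, mu);
    Umu = g * Umu * adj(Gimpl::CshiftLink(g, mu, 1));
    PokeIndex<LorentzIndex>(U, Umu, mu);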
75a1f85162 Added method to compute and return the Wilson flow energy density over some number of steps 2021-06-30 17:24:00 -04:00
ac4f2d9798 Fixed EOFA approx test square-rooting the result inappropriately and thus failing when it shouldn't
To MDWF+ID GPBC evol main program, added routine to compute the lower bound of the EOFA using the power method with a command line toggle
2021-06-09 09:08:37 -04:00
c3b99de33f In EOFA pseudofermion action, implemented M^{-1} (this costs the same as M for EOFA!)
Added tests/solver/Test_eofa_inv.cc to test the above
In MDWF+ID GPBC binary, tests of RHMC approx for the action / MD approxs can be performed separately using a cmdline toggle
2021-06-03 11:11:14 -04:00
e1a02bb80a Added main program to reproduce 32ID ensemble with 240MeV pions and GPBC
Allowed EOFA to accept different solvers for the L and R operations in the heatbath step
Fixed EOFA Meofa operating on member Phi rather than input field
Added derived EOFA pseudofermion variant that allows for mixed prec CG to be used in the heatbath
Added forces/Test_mobius_gparity_eofa_mixed testing the above reproduces the regular EOFA
To Test_gamma, added checks for the various properties of the charge conjugation matrix C=-gamma2*gamma4 in Grid basis
2021-06-01 11:44:34 -04:00
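
For reference, the standard properties such checks cover, in conventions where the charge conjugation matrix is C = -\gamma^2 \gamma^4 (stated from the usual Euclidean conventions, not read off the test code):

    C^\dagger C = 1, \qquad C^T = -C, \qquad C \gamma^\mu C^{-1} = -(\gamma^\mu)^T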
86f08c6b9a Added a check that the initial EOFA action agrees with |eta|^2, thus checking the quality of the rational approximation in the heatbath 2021-05-18 13:57:44 -04:00
9f0271039f Completed implementation of Meofa method of ExactOneFlavourRatio pseudofermion action
Added tests to tests/forces/Test_mobius_force_eofa.cc testing that the EOFA heatbath results in Phi = M^{-1/2} eta
2021-05-18 12:27:51 -04:00
24df770f74 Added tests/IO/Test_field_array_io.cc testing/demonstrating parallel IO of an array of 5D fermion fields 2021-05-13 12:32:45 -04:00
45b6c7effc Added a test code forces/Test_gpdwf_force_1f_2f that compares the action and force for DWF, EOFA and DSDR actions between the 1f and 2f implementations of G-parity BCs
Broke up ExactOneFlavourRatio refresh into a virtual routine that generates eta and one that uses it, as with the ratio and RHMC actions
Added accessors to the pseudofermion field to TwoFlavourEvenOddRatio and ExactOneFlavourRatio
2021-05-12 16:34:07 -04:00
1c70d8c4d9 Warning removed 2021-05-05 19:56:04 -04:00
f0e9a5299f Happy on GCC I hope 2021-05-05 19:55:34 -04:00
f1b8ba45e7 Suppressed a GCC warning unrelated to my code (why doesn't it shut up about its ABI fix?) 2021-05-05 19:54:21 -04:00
fe998ab578 Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC 2021-05-05 17:36:51 -04:00
c2ee2b5fd1 Random changes 2021-05-05 17:36:38 -04:00
3b734ee397 two point function example 2021-05-05 17:36:19 -04:00
8637a9512a Freeze Gaussian implementation 2021-05-05 17:34:54 -04:00
7f6e2ee03e Drop normal_distribution, standardise 2021-05-05 17:34:17 -04:00
7b02acb2bd Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC 2021-05-04 13:45:11 -04:00
86948c6ea0 CRC for finger print fields - aids debug / version diff 2021-05-04 13:44:38 -04:00
53d226924a CRC added 2021-05-04 13:44:07 -04:00
80176b1b39 RHMC now outputs some initial norms to the logs
Fixed DWF+I Gparity binaries not correctly assigning twist directions (thanks Peter!)
2021-05-04 13:12:23 -04:00
29ddafd0fc Added variant of G-parity DWF+I ensemble gen code using double prec RHMC 2021-04-30 13:12:24 -04:00
0f08364e4f Mom filter refresh sRNG 2021-04-26 23:18:11 +02:00
a198d59381 Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC 2021-04-26 21:05:52 +02:00
3a4f5f2324 Merge develop, strengthen force tests 2021-04-22 18:54:00 -04:00
824d84473f Merge branch 'develop' into feature/gparity_HMC 2021-04-22 16:32:41 -04:00
38964a4076 Switch twist direction 2021-04-22 15:57:37 -04:00
0d9aa87228 Reduce momentum to the GP plane 2021-04-22 15:56:59 -04:00
0e959d9b94 Update plaquette analysis 2021-04-22 15:55:47 -04:00
752f70cd48 Merge branch 'develop' into feature/gparity_HMC 2021-04-22 01:58:11 +02:00
e0e42873c1 Const correctness for Lattice::Replicate
Adapted GeneralEvenOddRationalRatio and Test_rhmc_EOWilsonRatio_doubleVsMixedPrec to recent changes that require passing in serial RNG

For GeneralEvenOddRationalRatio and TwoFlavourEvenOddRatio, broke refresh into two stages, the first of which generates the random field and the second that computes the pseudofermion field.
This allows derived classes to override the generation of the random field, for example in testing.

Test_dwf_gpforce now uses Gparity in x-direction and APBC in time as opposed to G-parity in time

Added Test_action_dwf_gparity2fvs1f that compares the DWF fermion action with the 2f and the 1f (doubled-lattice) implementations of Gparity
2021-04-14 16:41:27 -04:00
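
The refresh split described above is the template-method pattern; a minimal sketch of the shape, with illustrative names rather than the actual Grid signatures:

    // Hedged sketch of the two-stage refresh (illustrative names/types).
    template<class Field, class GaugeField>
    class TwoStageRefreshSketch {
    public:
      virtual ~TwoStageRefreshSketch() {}
      // Stage 1: generate the random field; tests can override this.
      virtual void generateEta(Field &eta, GridParallelRNG &pRNG) {
        gaussian(pRNG, eta);
      }
      // Stage 2: turn eta into the pseudofermion field (action-specific).
      virtual void computePhi(const GaugeField &U, const Field &eta) = 0;
      void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
        Field eta(U.Grid());
        generateEta(eta, pRNG);   // a test can inject a fixed eta here
        computePhi(U, eta);
      }
    };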
0ff3bf6dc5 Merge branch 'develop' into feature/gparity_HMC 2021-03-22 15:33:13 -04:00
351eab02ae Comment fix 2021-03-22 14:39:17 -04:00
feee5ccde2 Added Gparity flavour Pauli matrix algebra and associated tensor types mirroring strategy used for Gamma matrices
Added test program for the above
2021-03-03 15:39:41 -05:00
e0f6a146d8 To DWF+I G-parity evolution code, added ability to specify number of MD steps in params and an optional usage mode that reads the config and checks the plaq/checksum agree then exits 2021-02-16 10:41:52 -05:00
daa095c519 Fixed an obscure but reproducible hang in the RHMC caused by the bounds check being activated by a random number that wasn't synchronized over the nodes
HMC now also reports the "L-infinity norm" of the impulse, aka the largest site norm
2021-02-09 12:55:46 -05:00
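
That is, in addition to the usual L2 norm, the log now carries \|F\|_\infty = \max_x \|F(x)\|, the largest single-site norm of the MD impulse.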
c2676853ca Merge branch 'bugfix/maxnorm2' into feature/gparity_HMC 2021-02-08 12:17:33 -05:00
6a824033f8 Merge branch 'develop' into feature/gparity_HMC 2021-02-08 09:31:49 -05:00
cee6a37639 Added a logging tag for HMC
As the integrator logger is active by default, the cmdline option to activate it had no effect. Changed the option to *de*activate on request ("NoIntegrator")
Cleaned up generating rational approxs in the general RHMC code
As the tolerance of the rational approx is not related to the CG tolerance, regenerating approxs for MD and MC if they differ only by the CG tolerance is not necessary; this has been fixed
In DWF+I Gparity evolution code, added cmdline options to check the rational approximations and compute the lowest/highest eigenvalues of M^dagM for RHMC tuning
In the above, changed the integrator layout to a much simpler one that completes much faster; may need additional tuning
2021-02-08 09:30:35 -05:00
6cc3ad110c Improved logging output for RHMC bounds checks
In GenericHMCRunner, exposed functionality for initializing gauge fields and RNG for external use
2021-01-29 12:35:00 -05:00
e6c6f82c52 Gparity DWF+I HMC main program now has option to specify parameter file 2021-01-27 11:18:41 -05:00
d10d0c4e7f Merge branch 'develop' into feature/gparity_HMC 2021-01-25 15:13:29 -05:00
9c106d625a Added HMC main program designed to reproduce the 16^3x32x16 DWF+I ensembles with beta=2.13 and Gparity BCs 2021-01-25 15:07:44 -05:00
6795bbca31 Generalized GeneralEvenOddRatioRationalPseudoFermionAction such that the multi-shift CG algorithm can be overridden by derived classes
Added a mixed-precision variant of GeneralEvenOddRatioRationalPseudoFermionAction and a verification test against double prec class
Fixed non-const reference used in passing RHMC approx to multishift classes
2021-01-25 14:22:31 -05:00
d161c2dc35 Improved formatting of timing output in mixed-prec multishift
In test of mixed-prec multishift, added comparison against full double precision multishift both for timing and to cross-check the results
2021-01-20 15:42:06 -05:00
7a06826cf1 Added option to NerscIO to disable exit on a failing plaquette check, allowing circumvention of a factor-of-2 error in CPS-generated G-parity config headers
Adapted mixed-prec multi-shift test to new way to pass gauge BC directions and added cmdline option to perform the G-parity plaquette comparison with the corrected plaquette when loading config
2021-01-20 13:31:50 -05:00
c3712b8e06 Merge branch 'develop' into feature/gparity_HMC 2021-01-20 11:48:52 -05:00
901ee77b84 Mixed precision multishift test can now be performed with/without G-parity using cmdline check and can load a pregenerated configuration 2021-01-20 11:45:44 -05:00
1b84f59273 Added a mixed precision multishift algorithm for which the matrix multiplies are performed in single precision but the search directions are accumulated in double precision.
A reliable update step is performed at a tunable frequency to correct the residual. A final mixed-prec single-shift solve is performed on each pole to perform cleanup if necessary.
A test is provided to demonstrate the algorithm.
2021-01-06 12:24:44 -05:00
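
A control-flow schematic of the algorithm just described (a hedged sketch; variable names are illustrative and the standard multishift coefficient updates are elided):

    // Hedged sketch: multishift CG with single-precision matrix multiplies,
    // double-precision search-direction accumulation, and periodic
    // reliable updates of the true residual.
    for (int k = 0; k < MaxIterations; ++k) {
      precisionChange(p_f, p_d);          // demote search direction
      LinopF.HermOp(p_f, mp_f);           // matmul in single precision
      precisionChange(mp_d, mp_f);        // promote result
      // standard multishift alpha/beta/zeta updates, all in double
      if ((k % ReliableUpdateFreq) == 0) {
        LinopD.HermOp(psi_d, tmp_d);      // recompute true residual in double
        r_d = src_d - tmp_d;
      }
    }
    // Final cleanup: one mixed-precision single-shift solve per pole.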
1fb41a4300 Added copyLane function to Tensor_extract_merge.h which copies one lane of data from an input tensor object to a different lane of an output tensor object of potentially different precision
precisionChange lattice function now uses copyLane to remove need for temporary scalar objects, reducing register footprint and significantly improving performance
2021-01-06 11:50:56 -05:00
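
Conceptually, copyLane moves one SIMD lane between vectorised tensors of possibly different precision, one scalar word at a time; a hedged sketch of the idea (not the actual implementation), assuming Grid's lane-major layout where scalar word w of lane l sits at index w*Nsimd+l:

    // Hedged sketch: copy lane `lane_in` of `in` into lane `lane_out` of
    // `out`, converting precision word by word; the tensor structures of
    // the two objects are assumed identical up to precision.
    template<class vobjOut, class vobjIn>
    accelerator_inline void copyLaneSketch(vobjOut &out, int lane_out,
                                           const vobjIn &in, int lane_in) {
      typedef typename vobjOut::scalar_type Sout;
      typedef typename vobjIn::scalar_type  Sin;
      Sout *op = (Sout *)&out;
      const Sin *ip = (const Sin *)&in;
      const int words = sizeof(vobjIn) / sizeof(typename vobjIn::vector_type);
      const int Nin   = vobjIn::vector_type::Nsimd();
      const int Nout  = vobjOut::vector_type::Nsimd();
      for (int w = 0; w < words; w++)
        op[w * Nout + lane_out] = Sout(ip[w * Nin + lane_in]);
    }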
287bac946f ConjugateGradientMixedPrec now stores final true residual and uses the precisionChange workspaces for improved efficiency 2021-01-06 09:50:41 -05:00
80c14be65e Added core test to check precision change 2021-01-06 09:34:44 -05:00
d7a2a4852d Reimplemented precisionChange to run on GPUs. A workspace containing the mapping table can be optionally precomputed and reused for improved performance. 2021-01-06 09:30:49 -05:00
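
The likely call shape, given that description (the precisionChangeWorkspace name and argument order are inferred from the message; treat them as assumptions):

    // Hedged sketch: build the site-mapping workspaces once, reuse per call.
    precisionChangeWorkspace wk_d2f(GridF, GridD);   // output grid, input grid (assumed order)
    precisionChangeWorkspace wk_f2d(GridD, GridF);
    for (int k = 0; k < Nouter; ++k) {
      precisionChange(x_f, x_d, wk_d2f);   // demote with cached mapping table
      // (single-precision inner work here)
      precisionChange(x_d, x_f, wk_f2d);   // promote back
    }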
d185f2eaa7 OneFlavourEvenOddRatioRationalPseudoFermionAction now derives from GeneralEvenOddRatioRationalPseudoFermionAction, simply performs transcription of parameters 2020-12-23 16:26:10 -05:00
813d4cd900 Added test program that ensures the generic checkerboarded RHMC (with parameters set appropriately) gives the same answer as the existing 1f code 2020-12-23 16:01:42 -05:00
75c6c6b173 General RHMC pseudofermion action now allows for different rational approximations to be used in the MD and action evaluation 2020-12-23 11:19:26 -05:00
220ad5e3ee Added more verbose log output to GeneralEvenOddRatioRationalPseudoFermionAction
In GeneralEvenOddRatioRationalPseudoFermionAction, setting the bounds check frequency to 0 now disables the check
2020-12-22 11:08:22 -05:00
ba5dc670a5 Reimplemented GparityWilsonImpl::InsertForce5D to run efficiently on GPUs
Swapped order of templated tensor code and c-number specializations in Tensor_outer.h to fix compile issue with type deduction on Summit
2020-12-22 10:10:07 -05:00
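
The Tensor_outer.h reordering is an instance of a general two-phase-lookup effect: for a dependent call on a fundamental type, only overloads declared before the calling template are found, since ADL contributes nothing. A standalone illustration (unrelated to the actual Grid code):

    #include <iostream>

    // Generic (templated) overload, mirroring the tensor code path.
    template<class T> int outerSketch(const T &) { return 0; }

    // c-number overload. Declared BEFORE any template that calls
    // outerSketch on a fundamental type, it is found by ordinary lookup
    // at the template's definition point.
    int outerSketch(double) { return 1; }

    template<class T> int callerSketch(const T &x) { return outerSketch(x); }

    int main() {
      // Prints 1. Had outerSketch(double) been declared after callerSketch,
      // the dependent call would see only the template (ADL finds nothing
      // for fundamental types) and this would print 0.
      std::cout << callerSketch(3.14) << std::endl;
      return 0;
    }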
a0ca362690 Added an RHMC pseudofermion action, GeneralEvenOddRatioRationalPseudoFermionAction, that works for an arbitrary fractional power, not just a square root
Added a test evolution for the above, Test_rhmc_EOWilsonRatioPowQuarter, demonstrating conservation of Hamiltonian
Fixed HMC ignoring the MetropolisTest parameter of HMCparameters
2020-12-17 16:21:58 -05:00
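
Schematically, the generalisation replaces the square root with an arbitrary rational power: the pseudofermion integral represents

    \det\left[(M^\dagger M)^{p/q}\right] = \int \mathcal{D}\phi^\dagger \mathcal{D}\phi \; e^{-\phi^\dagger (M^\dagger M)^{-p/q} \phi},

with the fractional powers realised by Remez-generated rational approximations and multishift solves; p/q = 1/2 recovers the usual one-flavour square-root case. (This is the textbook single-operator form; the class itself works with the even-odd preconditioned ratio of two operators.)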
249b6e61ec For G-parity BCs the Nd-1 direction is now assumed to be the time direction and setting a twist in this direction will apply antiperiodic BCs
Added option to run Test_gparity with antiperiodic time BCs
2020-12-17 14:09:00 -05:00
686 changed files with 22550 additions and 69331 deletions


@ -1,54 +0,0 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]
body:
- type: markdown
attributes:
value: >
Thank you for taking the time to file a bug report.
Please check that the code is pointing to the HEAD of develop
or any commit in master which is tagged with a version number.
- type: textarea
attributes:
label: "Describe the issue:"
description: >
Describe the issue and any previous attempt to solve it.
validations:
required: true
- type: textarea
attributes:
label: "Code example:"
description: >
If relevant, show how to reproduce the issue using a minimal working
example.
placeholder: |
<< your code here >>
render: shell
validations:
required: false
- type: textarea
attributes:
label: "Target platform:"
description: >
Give a description of the target platform (CPU, network, compiler).
Please give the full CPU part description, using for example
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
or `sysctl machdep.cpu.brand_string` (macOS) and the full output
the `--version` option of your compiler.
validations:
required: true
- type: textarea
attributes:
label: "Configure options:"
description: >
Please give the exact configure command used and attach
`config.log`, `grid.config.summary` and the output of `make V=1`.
render: shell
validations:
required: true

.gitignore (vendored)

@ -1,7 +1,3 @@
# Doxygen stuff
html/*
latex/*
# Compiled Object files #
#########################
*.slo

File diff suppressed because it is too large


@ -1,2 +0,0 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL


@ -1,5 +0,0 @@
CXX=hipcc
MPICXX=mpicxx
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench


@ -1,2 +0,0 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL


@ -34,6 +34,9 @@ directory
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
#if defined __GNUC__
#pragma GCC diagnostic ignored "-Wpsabi"
#endif
//disables an Intel compiler specific warning (in json.hpp)
@ -44,22 +47,14 @@ directory
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
#endif
//Eigen only
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC


@ -44,10 +44,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h>
//#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h>
#include <Grid/perfmon/Tracing.h>
#include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/threads/ThreadReduction.h>
@ -59,7 +58,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice.h>
#include <Grid/cshift/Cshift.h>
#include <Grid/stencil/Stencil.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore)


@ -16,7 +16,6 @@
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>


@ -14,11 +14,7 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
#endif
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
@ -34,7 +30,7 @@
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
#undef EIGEN_USE_SYCL
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif


@ -66,10 +66,6 @@ if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a


@ -30,14 +30,9 @@ directory
#include <type_traits>
#include <cassert>
#include <exception>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }


@ -29,9 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H
NAMESPACE_CHECK(blas);
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h>
@ -47,11 +44,7 @@ NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h>
@ -62,7 +55,6 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@ -74,11 +66,10 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/multigrid/MultiGrid.h>
NAMESPACE_CHECK(multigrid);
#include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h>
#endif


@ -56,6 +56,243 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
blockSum(CoarseInner,fine_inner_msk);
}
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Block Gram-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possibly more fine-grained control is needed than a linear sweep,
// but there is a huge productivity gain if this is a simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
};
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
@ -87,9 +324,9 @@ public:
GridBase* _cbgrid;
int hermitian;
CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd;
CartesianStencil<siteVector,siteVector,int> Stencil;
CartesianStencil<siteVector,siteVector,int> StencilEven;
CartesianStencil<siteVector,siteVector,int> StencilOdd;
std::vector<CoarseMatrix> A;
std::vector<CoarseMatrix> Aeven;
@ -99,7 +336,7 @@ public:
CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd;
deviceVector<RealD> dag_factor;
Vector<RealD> dag_factor;
///////////////////////
// Interface
@ -124,13 +361,9 @@ public:
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -165,7 +398,7 @@ public:
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
};
void Mdag (const CoarseVector &in, CoarseVector &out)
@ -194,14 +427,9 @@ public:
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -210,10 +438,10 @@ public:
int osites=Grid()->oSites();
deviceVector<int> points(geom.npoint);
for(int p=0; p<geom.npoint; p++) {
acceleratorPut(points[p],geom.points_dagger[p]);
}
Vector<int> points(geom.npoint, 0);
for(int p=0; p<geom.npoint; p++)
points[p] = geom.points_dagger[p];
auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0];
@ -245,7 +473,7 @@ public:
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
void MdirComms(const CoarseVector &in)
@ -260,14 +488,8 @@ public:
out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite);
@ -300,7 +522,7 @@ public:
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{
@ -409,7 +631,7 @@ public:
assert(Aself != nullptr);
}
void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
const CoarseVector &in, CoarseVector &out, int dag) {
int point = geom.npoint-1;
autoView( out_v, out, AcceleratorWrite);
@ -472,7 +694,7 @@ public:
}
}
void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a,
void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
const CoarseVector &in, CoarseVector &out, int dag) {
SimpleCompressor<siteVector> compressor;
@ -484,20 +706,14 @@ public:
// determine in what order we need the points
int npoint = geom.npoint-1;
deviceVector<int> points(npoint);
for(int p=0; p<npoint; p++) {
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
Vector<int> points(npoint, 0);
for(int p=0; p<npoint; p++)
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
auto points_p = &points[0];
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -560,7 +776,7 @@ public:
});
}
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@ -568,9 +784,9 @@ public:
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,_cbgrid),
Aodd(geom.npoint,_cbgrid),
@ -588,9 +804,9 @@ public:
_cbgrid(&CoarseRBGrid),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,&CoarseRBGrid),
Aodd(geom.npoint,&CoarseRBGrid),
@ -611,13 +827,11 @@ public:
}
// GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, {
int j = i/nbasis;
int k = i%nbasis;
h_dag_factor[i] = dag_factor_eigen(j, k);
dag_factor[i] = dag_factor_eigen(j, k);
});
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
}
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,


@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#if defined(USE_MKL) || defined(GRID_SYCL)
#ifdef USE_MKL
#include <fftw/fftw3.h>
#else
#include <fftw3.h>
@ -168,7 +168,6 @@ public:
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
assert(0);
#else
conformable(result.Grid(),vgrid);
@ -191,8 +190,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -215,7 +213,6 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +226,6 @@ public:
}
// Barrel shift and collect global pencil
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@ -251,7 +247,6 @@ public:
}
}
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@ -274,7 +269,6 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +285,6 @@ public:
}
result = result*div;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif


@ -103,38 +103,6 @@ public:
_Mat.MdagM(in,out);
}
};
template<class Matrix,class Field>
class MMdagLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MMdag(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.MMdag(in,out);
}
};
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother
@ -177,44 +145,6 @@ public:
}
};
////////////////////////////////////////////////////////////////////
// Create a shifted HermOp
////////////////////////////////////////////////////////////////////
template<class Field>
class ShiftedHermOpLinearOperator : public LinearOperatorBase<Field> {
LinearOperatorBase<Field> &_Mat;
RealD _shift;
public:
ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
void Op (const Field &in, Field &out){
HermOp(in,out);
}
void AdjOp (const Field &in, Field &out){
HermOp(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.HermOp(in,out);
out = out + _shift*in;
}
};
////////////////////////////////////////////////////////////////////
// Wrap an already herm matrix
////////////////////////////////////////////////////////////////////
@ -277,38 +207,6 @@ public:
assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
@ -628,7 +526,6 @@ public:
(*this)(Linop,in[k],out[k]);
}
};
virtual ~OperatorFunction(){};
};
template<class Field> class LinearFunction {
@ -644,7 +541,6 @@ public:
(*this)(in[i], out[i]);
}
}
virtual ~LinearFunction(){};
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {


@ -45,11 +45,6 @@ public:
M(in,tmp);
Mdag(tmp,out);
}
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;


@ -59,7 +59,7 @@ public:
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.02;
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
@ -90,8 +90,9 @@ public:
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order,0.0);
Coeffs[order-1] = 1.0;
Coeffs.resize(order);
Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
};
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
@ -131,26 +132,6 @@ public:
Coeffs[j] = s * 2.0/order;
}
};
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
@ -277,12 +258,26 @@ public:
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
#if 0
auto y_v = y.View();
auto Tn_v = Tn->View();
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#else
axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#endif
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
@ -297,6 +292,7 @@ public:
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;


@ -40,7 +40,7 @@ public:
RealD norm;
RealD lo,hi;
MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;};
MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
RealD approx(RealD x);
void csv(std::ostream &out);
void gnuplot(std::ostream &out);


@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
* Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
* type = 1 for the approximation which is infinite at x = 0. */
zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
l, invlambda, xi, xisq, *tv, s, opl;
int m, czero, ts;
@ -375,12 +375,12 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to ZOLO_PRECISION for external use only */
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -390,24 +390,24 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
}
zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
zolotarev_data* higham(PRECISION epsilon, int n) {
INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
int m, czero;
zolotarev_data *zd;
@ -481,9 +481,9 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -493,24 +493,24 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
#ifdef TEST
#undef ZERO
#define ZERO ((ZOLO_PRECISION) 0)
#define ZERO ((PRECISION) 0)
#undef ONE
#define ONE ((ZOLO_PRECISION) 1)
#define ONE ((PRECISION) 1)
#undef TWO
#define TWO ((ZOLO_PRECISION) 2)
#define TWO ((PRECISION) 2)
/* Evaluate the rational approximation R(x) using the factored form */
static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R;
PRECISION R;
if (rdata -> type == 0) {
R = rdata -> A * x;
@ -551,9 +551,9 @@ static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
/* Evaluate the rational approximation R(x) using the partial fraction form */
static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
PRECISION R = rdata -> alpha[rdata -> da - 1];
for (m = 0; m < rdata -> dd; m++)
R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@ -568,18 +568,18 @@ static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data*
* non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
* but with signalling overflow you will get an error message. */
static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R = rdata -> beta[0] * x;
PRECISION R = rdata -> beta[0] * x;
for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
return R;
}
/* Evaluate the rational approximation R(x) using Cayley form */
static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION T;
PRECISION T;
T = rdata -> type == 0 ? ONE : -ONE;
for (m = 0; m < rdata -> n; m++)
@ -607,7 +607,7 @@ int main(int argc, char** argv) {
int m, n, plotpts = 5000, type = 0;
float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
zolotarev_data *rdata;
ZOLO_PRECISION y;
PRECISION y;
FILE *plot_function, *plot_error,
*plot_partfrac, *plot_contfrac, *plot_cayley;
@ -626,13 +626,13 @@ int main(int argc, char** argv) {
}
rdata = type == 2
? higham((ZOLO_PRECISION) eps, n)
: zolotarev((ZOLO_PRECISION) eps, n, type);
? higham((PRECISION) eps, n)
: zolotarev((PRECISION) eps, n, type);
printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
"\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
"\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
"\tPRECISION = " STRINGIFY(PRECISION)
"\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
"\tDelta = %g (maximum error)\n\n"
"\tA = %g (overall factor)\n",
@ -681,15 +681,15 @@ int main(int argc, char** argv) {
x = 2.4 * (float) m / plotpts - 1.2;
if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
/* skip x = 0 for type 1, as R(0) is singular */
y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
y = zolotarev_eval((PRECISION) x, rdata);
fprintf(plot_function, "%g %g\n", x, (float) y);
fprintf(plot_error, "%g %g\n",
x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
maxypferr = MAX(maxypferr, fabs(ypferr));


@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef ZOLO_PRECISION
#define ZOLO_PRECISION double
#ifndef PRECISION
#define PRECISION double
#endif
#define ZPRECISION ZOLO_PRECISION
#define ZPRECISION PRECISION
#define ZOLOTAREV_DATA zolotarev_data
#endif
@ -77,8 +77,8 @@ typedef struct {
* zolotarev_data structure. The arguments must satisfy the constraints that
* epsilon > 0, n > 0, and type = 0 or 1. */
ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
void zolotarev_free(zolotarev_data *zdata);
#endif
@ -86,4 +86,3 @@ void zolotarev_free(zolotarev_data *zdata);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif


@ -1,34 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_BEGIN(Grid);
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
NAMESPACE_END(Grid);

File diff suppressed because it is too large


@ -1,376 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
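// Computes AP_r = scale * sum_s X_s m(s,r) + Y_r; Y is copied up front so AP and Y may alias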
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
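// i.e. a batch of one GEMM: Y (vw x nrhs) = X (vw x nrhs) . C (nrhs x nrhs)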
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
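// 8 real flops per complex multiply-add; times are in microseconds, so the /1.e3 below yields GF/s and GB/s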
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);
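Taken together, MulMatrix, MaddMatrix and InnerProductMatrix are the whole linear-algebra surface the block CG further down needs. A hypothetical driver, assuming a valid GridBase *grid and initialised fields (a sketch, not a tested example):

    int nrhs = 4;
    std::vector<LatticeFermionD> X(nrhs, grid), Y(nrhs, grid); // assumed initialised
    MultiRHSBlockCGLinalg<LatticeFermionD> linalg;
    Eigen::MatrixXcd m = Eigen::MatrixXcd::Identity(nrhs, nrhs);
    linalg.MulMatrix(Y, m, X);          // Y_r = sum_s X_s m(s,r); identity => Y == X
    linalg.InnerProductMatrix(m, X, Y); // m(r,s) = <X_r, Y_s>, globally summed
    linalg.MaddMatrix(Y, m, X, Y, 0.5); // Y_r = 0.5 * sum_s X_s m(s,r) + Y_r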

View File

@@ -1,513 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSDeflation.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
MultiRHS block projection
Import basis -> nblock x nbasis x (block x internal)
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
=> coarse_(nrhs x nbasis )^block = via batched GEMM
//template<class vobj,class CComplex,int nbasis,class VLattice>
//inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
// const VLattice &fineData,
// const VLattice &Basis)
*/
template<class Field>
class MultiRHSBlockProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef Field Fermion;
int nbasis;
GridBase *coarse_grid;
GridBase *fine_grid;
uint64_t block_vol;
uint64_t fine_vol;
uint64_t coarse_vol;
uint64_t words;
// Row major layout "C" order:
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
/*
* in Fortran column major notation (cuBlas order)
*
* Vxb = [v1(x)][..][vn(x)] ... x coarse vol
*
* Fxr = [r1(x)][..][rm(x)] ... x coarse vol
*
* Block project:
* C_br = V^dag F x coarse vol
*
* Block promote:
* F_xr = Vxb Cbr x coarse_vol
*/
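// Per coarse cell these are single GEMMs:
//   project: C (nbasis x nrhs)          = V^dag (nbasis x block_vol*words) . F
//   promote: F (block_vol*words x nrhs) = V (block_vol*words x nbasis) . C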
deviceVector<scalar> BLAS_V; // words * block_vol * nbasis x coarse_vol
deviceVector<scalar> BLAS_F; // nrhs x fine_vol * words -- the sources
deviceVector<scalar> BLAS_C; // nrhs x coarse_vol * nbasis -- the coarse coeffs
RealD blasNorm2(deviceVector<scalar> &blas)
{
scalar ss(0.0);
std::vector<scalar> tmp(blas.size());
acceleratorCopyFromDevice(&blas[0],&tmp[0],blas.size()*sizeof(scalar));
for(int64_t s=0;s<blas.size();s++){
ss=ss+tmp[s]*adj(tmp[s]);
}
coarse_grid->GlobalSum(ss);
return real(ss);
}
MultiRHSBlockProject(){};
~MultiRHSBlockProject(){ Deallocate(); };
void Deallocate(void)
{
nbasis=0;
coarse_grid=nullptr;
fine_grid=nullptr;
fine_vol=0;
block_vol=0;
coarse_vol=0;
words=0;
BLAS_V.resize(0);
BLAS_F.resize(0);
BLAS_C.resize(0);
}
void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid)
{
nbasis=_nbasis;
fine_grid=_fgrid;
coarse_grid=_cgrid;
fine_vol = fine_grid->lSites();
coarse_vol = coarse_grid->lSites();
block_vol = fine_vol/coarse_vol;
words = sizeof(scalar_object)/sizeof(scalar);
BLAS_V.resize (fine_vol * words * nbasis );
}
void ImportFineGridVectors(std::vector <Field > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename Field::vector_object vobj;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
assert(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
}
uint64_t sz = blas.size();
acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar));
Coordinate fine_rdimensions = fine_grid->_rdimensions;
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
int64_t bv= block_vol;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector importing vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
autoView( fineData , vecs[v], AcceleratorRead);
auto blasData_p = &blas[0];
auto fineData_p = &fineData[0];
int64_t osites = fine_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
// std::cout << "sz "<<sz<<std::endl;
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
uint64_t lwords= words; // local variable for copy in to GPU
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
// One thread per fine site
Coordinate coor_f(_ndimension);
Coordinate coor_b(_ndimension);
Coordinate coor_c(_ndimension);
// Fine site to fine coor
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
int sc;// coarse site
int sb;// block site
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
scalar_object data = extractLane(lane,fineData[sf]);
// BLAS layout address calculation
// words * block_vol * nbasis x coarse_vol
// coarse oSite x block vol x lanes
int64_t site = (lane*osites + sc*bv)*nvec
+ v*bv
+ sb;
// assert(site*lwords<sz);
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
*ptr = data;
#ifdef GRID_SIMT
}
#else
}
#endif
});
// std::cout << " import fine Blas norm "<<blasNorm2(blas)<<std::endl;
// std::cout << " BlockProjector imported vector"<<v<<std::endl;
}
}
void ExportFineGridVectors(std::vector <Field> &vecs, deviceVector<scalar> &blas)
{
typedef typename Field::vector_object vobj;
int nvec = vecs.size();
assert(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
}
Coordinate fine_rdimensions = fine_grid->_rdimensions;
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
// std::cout << " export fine Blas norm "<<blasNorm2(blas)<<std::endl;
int64_t bv= block_vol;
for(int v=0;v<vecs.size();v++){
autoView( fineData , vecs[v], AcceleratorWrite);
auto blasData_p = &blas[0];
auto fineData_p = &fineData[0];
int64_t osites = fine_grid->oSites();
uint64_t lwords = words;
// std::cout << " Nsimd is "<<vobj::Nsimd() << std::endl;
// std::cout << " lwords is "<<lwords << std::endl;
// std::cout << " sizeof(scalar_object) is "<<sizeof(scalar_object) << std::endl;
// loop over fine sites
accelerator_for(sf,osites,vobj::Nsimd(),{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(vobj::Nsimd()); // buffer lane
#else
for(int lane=0;lane<vobj::Nsimd();lane++) {
#endif
// One thread per fine site
Coordinate coor_f(_ndimension);
Coordinate coor_b(_ndimension);
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
int sc;
int sb;
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
// BLAS layout address calculation
// words * block_vol * nbasis x coarse_vol
int64_t site = (lane*osites + sc*bv)*nvec
+ v*bv
+ sb;
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
scalar_object data = *ptr;
insertLane(lane,fineData[sf],data);
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
}
template<class vobj>
void ImportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector importing coarse vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
autoView( coarseData , vecs[v], AcceleratorRead);
auto blasData_p = &blas[0];
auto coarseData_p = &coarseData[0];
int64_t osites = coarse_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
// C_br per site
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
coarse_scalar_object data = extractLane(lane,coarseData[sc]);
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
*ptr = data;
#ifdef GRID_SIMT
}
#else
}
#endif
});
// std::cout << " import coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
}
}
template<class vobj>
void ExportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
// std::cout << " export coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector exporting coarse vector"<<v<<std::endl;
autoView( coarseData , vecs[v], AcceleratorWrite);
auto blasData_p = &blas[0];
auto coarseData_p = &coarseData[0];
int64_t osites = coarse_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
// Wrap in a macro: FOR_ALL_LANES(lane, { ... });
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
coarse_scalar_object data = *ptr;
insertLane(lane,coarseData[sc],data);
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
}
void ImportBasis(std::vector < Field > &vecs)
{
// std::cout << " BlockProjector Import basis size "<<vecs.size()<<std::endl;
ImportFineGridVectors(vecs,BLAS_V);
}
template<class cobj>
void blockProject(std::vector<Field> &fine,std::vector< Lattice<cobj> > & coarse)
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
assert(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
/////////////////////////////////////////////
// Copy in the multi-rhs sources to same data layout
/////////////////////////////////////////////
// std::cout << "BlockProject import fine"<<std::endl;
ImportFineGridVectors(fine,BLAS_F);
deviceVector<scalar *> Vd(coarse_vol);
deviceVector<scalar *> Fd(coarse_vol);
deviceVector<scalar *> Cd(coarse_vol);
// std::cout << "BlockProject pointers"<<std::endl;
for(int c=0;c<coarse_vol;c++){
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
acceleratorPut(Vd[c],Vh);
acceleratorPut(Fd[c],Fh);
acceleratorPut(Cd[c],Ch);
}
GridBLAS BLAS;
// std::cout << "BlockProject BLAS"<<std::endl;
int64_t vw = block_vol * words;
/////////////////////////////////////////
// C_br = V^dag R
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw,
scalar(1.0),
Vd,
Fd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl;
ExportCoarseGridVectors(coarse, BLAS_C);
// std::cout << "BlockProject done"<<std::endl;
}
template<class cobj>
void blockPromote(std::vector<Field> &fine,std::vector<Lattice<cobj> > & coarse)
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
assert(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
ImportCoarseGridVectors(coarse, BLAS_C);
GridBLAS BLAS;
deviceVector<scalar *> Vd(coarse_vol);
deviceVector<scalar *> Fd(coarse_vol);
deviceVector<scalar *> Cd(coarse_vol);
for(int c=0;c<coarse_vol;c++){
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
acceleratorPut(Vd[c],Vh);
acceleratorPut(Fd[c],Fh);
acceleratorPut(Cd[c],Ch);
}
/////////////////////////////////////////
// Block promote:
// F_xr = Vxb Cbr (x coarse_vol)
/////////////////////////////////////////
int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nbasis,
scalar(1.0),
Vd,
Cd,
scalar(0.0), // wipe out C
Fd);
BLAS.synchronise();
// std::cout << " blas call done"<<std::endl;
ExportFineGridVectors(fine, BLAS_F);
// std::cout << " exported "<<std::endl;
}
};
NAMESPACE_END(Grid);
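A hypothetical round trip through the projector; the coarse field typedef and the pre-filled basis, fine vectors and grids are assumptions, not part of this file:

    const int nbasis = 32;
    typedef Lattice<iVector<vComplexD, nbasis> > CoarseVec; // assumed coarse type
    int nrhs = 8;
    std::vector<LatticeFermionD> fine(nrhs, FineGrid);   // sources, assumed filled
    std::vector<CoarseVec>       coarse(nrhs, CoarseGrid);
    MultiRHSBlockProject<LatticeFermionD> mrhs_proj;
    mrhs_proj.Allocate(nbasis, FineGrid, CoarseGrid);    // grids assumed constructed
    mrhs_proj.ImportBasis(basis);                        // nbasis fine-grid basis vectors
    mrhs_proj.blockProject(fine, coarse);                // coarse_r = V^dag fine_r per block
    mrhs_proj.blockPromote(fine, coarse);                // fine_r   = V coarse_r  per block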

View File

@@ -1,233 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSDeflation.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs projection
i) MultiRHS Deflation
Import Evecs -> nev x vol x internal
Import vector of Lattice objects -> nrhs x vol x internal
=> Cij (nrhs x Nev) via GEMM.
=> Guess (nrhs x vol x internal) = C x evecs (via GEMM)
Export
ii) MultiRHS block projection
Import basis -> nblock x nbasis x (block x internal)
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
=> coarse_(nrhs x nbasis )^block = via batched GEMM
iii) Alternate interface:
Import higher dim Lattice object-> vol x nrhs layout
*/
template<class Field>
class MultiRHSDeflation
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
int nev;
std::vector<RealD> eval;
GridBase *grid;
uint64_t vol;
uint64_t words;
deviceVector<scalar> BLAS_E; // nev x vol -- the eigenbasis (up to a 1/sqrt(lambda))
deviceVector<scalar> BLAS_R; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_G; // nrhs x vol -- the guess
deviceVector<scalar> BLAS_C; // nrhs x nev -- the coefficients
MultiRHSDeflation(){};
~MultiRHSDeflation(){ Deallocate(); };
void Deallocate(void)
{
nev=0;
grid=nullptr;
vol=0;
words=0;
BLAS_E.resize(0);
BLAS_R.resize(0);
BLAS_C.resize(0);
BLAS_G.resize(0);
}
void Allocate(int _nev,GridBase *_grid)
{
nev=_nev;
grid=_grid;
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
eval.resize(nev);
BLAS_E.resize (vol * words * nev );
std::cout << GridLogMessage << " Allocate for "<<nev<<" eigenvectors and volume "<<vol<<std::endl;
}
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{
// std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
assert(ev<eval.size());
eval[ev] = _eval;
int64_t offset = ev*vol*words;
autoView(v,evec,AcceleratorRead);
acceleratorCopyDeviceToDevice(&v[0],&BLAS_E[offset],sizeof(scalar_object)*vol);
}
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval)
{
ImportEigenBasis(evec,_eval,0,evec.size());
}
// Could use to import a batch of eigenvectors
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
{
assert(_ev0+_nev<=evec.size());
Allocate(_nev,evec[0].Grid());
// Imports a sub-batch of eigenvectors, _ev0, ..., _ev0+_nev-1
for(int e=0;e<nev;e++){
std::cout << "Importing eigenvector "<<e<<" evalue "<<_eval[_ev0+e]<<std::endl;
ImportEigenVector(evec[_ev0+e],_eval[_ev0+e],e);
}
}
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
{
int nrhs = source.size();
assert(source.size()==guess.size());
assert(grid == guess[0].Grid());
conformable(guess[0],source[0]);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_R.resize(nrhs * vw); // cost free if size doesn't change
BLAS_G.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nev * nrhs);// cost free if size doesn't change
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
// for(int r=0;r<nrhs;r++){
// std::cout << " source["<<r<<"] = "<<norm2(source[r])<<std::endl;
// }
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(v,source[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&v[0],&BLAS_R[offset],sizeof(scalar_object)*vol);
}
/*
* in Fortran column major notation (cuBlas order)
*
* Exe = [e1(x)][..][en(x)]
*
* Rxr = [r1(x)][..][rm(x)]
*
* C_er = E^dag R
* C_er = C_er / lambda_e
* G_xr = Exe Cer
*/
deviceVector<scalar *> Ed(1);
deviceVector<scalar *> Rd(1);
deviceVector<scalar *> Cd(1);
deviceVector<scalar *> Gd(1);
scalar * Eh = & BLAS_E[0];
scalar * Rh = & BLAS_R[0];
scalar * Ch = & BLAS_C[0];
scalar * Gh = & BLAS_G[0];
acceleratorPut(Ed[0],Eh);
acceleratorPut(Rd[0],Rh);
acceleratorPut(Cd[0],Ch);
acceleratorPut(Gd[0],Gh);
GridBLAS BLAS;
/////////////////////////////////////////
// C_er = E^dag R
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw,
scalar(1.0),
Ed,
Rd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
assert(BLAS_C.size()==nev*nrhs);
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nev*nrhs);
for(int e=0;e<nev;e++){
RealD lam(1.0/eval[e]);
for(int r=0;r<nrhs;r++){
int off = e+nev*r;
HOST_C[off]=HOST_C[off] * lam;
// std::cout << "C["<<e<<"]["<<r<<"] ="<<HOST_C[off]<< " eval[e] "<<eval[e] <<std::endl;
}
}
acceleratorCopyToDevice(&HOST_C[0],&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/////////////////////////////////////////
// Guess G_xr = Exe Cer
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev,
scalar(1.0),
Ed, // x . nev
Cd, // nev . nrhs
scalar(0.0),
Gd);
BLAS.synchronise();
///////////////////////////////////////
// Copy out the multirhs
///////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(v,guess[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_G[offset],&v[0],sizeof(scalar_object)*vol);
}
RealD t1 = usecond();
std::cout << GridLogMessage << "MultiRHSDeflation for "<<nrhs<<" sources with "<<nev<<" eigenvectors took " << (t1-t0)/1e3 <<" ms"<<std::endl;
}
};
NAMESPACE_END(Grid);
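In use the class reduces to two calls; evec, eval, src, nrhs and grid are assumed to come from an earlier Lanczos run on the same grid, so this is a sketch rather than a tested example:

    MultiRHSDeflation<LatticeFermionD> deflate;
    deflate.ImportEigenBasis(evec, eval);   // calls Allocate() internally
    std::vector<LatticeFermionD> guess(nrhs, grid);
    deflate.DeflateSources(src, guess);     // guess_r = sum_e e_e <e_e|src_r> / lambda_e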

View File

@@ -33,111 +33,109 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Deflation methods considered
* -- Solve P A x = P b [ like Luscher ]
* DEF-1 M P A x = M P b [i.e. left precon]
* DEF-2 P^T M A x = P^T M b
* ADEF-1 Preconditioner = M P + Q [ Q + M + M A Q]
* ADEF-2 Preconditioner = P^T M + Q
* BNN Preconditioner = P^T M P + Q
* BNN2 Preconditioner = M P + P^TM +Q - M P A M
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
* Vout = x
*/
NAMESPACE_BEGIN(Grid);
template<class Field>
class TwoLevelCG : public LinearFunction<Field>
// abstract base
template<class Field, class CoarseField>
class TwoLevelFlexiblePcg : public LinearFunction<Field>
{
public:
int verbose;
RealD Tolerance;
Integer MaxIterations;
const int mmax = 5;
GridBase *grid;
GridBase *coarsegrid;
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
LinearOperatorBase<Field> *_Linop;
OperatorFunction<Field> *_Smoother;
LinearFunction<CoarseField> *_CoarseSolver;
// Need something that knows how to get from Coarse to fine and back again
// and most operator functions
TwoLevelCG(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
TwoLevelFlexiblePcg(RealD tol,
Integer maxit,
LinearOperatorBase<Field> *Linop,
LinearOperatorBase<Field> *SmootherLinop,
OperatorFunction<Field> *Smoother,
OperatorFunction<CoarseField> CoarseLinop
) :
Tolerance(tol),
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother)
{
grid = fine;
_Linop(Linop),
_PreconditionerLinop(PrecLinop),
_Preconditioner(Preconditioner)
{
verbose=0;
};
virtual void operator() (const Field &src, Field &x)
{
std::cout << GridLogMessage<<"HDCG: fPcg starting single RHS"<<std::endl;
// The Pcg routine is common to all, but the various matrices differ from derived
// implementation to derived implementation
void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f;
RealD rtzp,rtz,a,d,b;
RealD rptzp;
RealD tn;
RealD guess = norm2(psi);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 5;
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
std::vector<Field> p(mmax,grid);
std::vector<Field> p (mmax,grid);
std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax);
Field z(grid);
Field x (grid); x = psi;
Field z (grid);
Field tmp(grid);
Field mp (grid);
Field r (grid);
Field mu (grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated"<<std::endl;
//Initial residual computation & set up
RealD guess = norm2(x);
std::cout << GridLogMessage<<"HDCG: fPcg guess nrm "<<guess<<std::endl;
RealD src_nrm = norm2(src);
std::cout << GridLogMessage<<"HDCG: fPcg src nrm "<<src_nrm<<std::endl;
if ( src_nrm == 0.0 ) {
std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
x=Zero();
}
RealD tn;
GridStopWatch HDCGTimer;
HDCGTimer.Start();
Field r (grid);
Field mu (grid);
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
x=src;
Vstart(x,src);
// r0 = b -A x0
_FineLinop.HermOp(x,mmp[0]);
HermOp(x,mmp); // Shouldn't this be something else?
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
{
double n1 = norm2(x);
double n2 = norm2(mmp[0]);
double n3 = norm2(r);
std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
PcgM1(r,z);
M1(r,z,tmp,mp,SmootherMirs);
rtzp =real(innerProduct(r,z));
///////////////////////////////////////
// Solve for Mss mu = P A z and set p = z-mu
// Def2 p = 1 - Q Az = Pright z
// Def2: p = 1 - Q Az = Pright z
// Other algos M2 is trivial
///////////////////////////////////////
PcgM2(z,p[0]);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
Field pp(grid);
M2(z,p[0]);
for (int k=0;k<=MaxIterations;k++){
@@ -145,46 +143,31 @@ class TwoLevelCG : public LinearFunction<Field>
int peri_kp = (k+1) % mmax;
rtz=rtzp;
d= PcgM3(p[peri_k],mmp[peri_k]);
d= M3(p[peri_k],mp,mmp[peri_k],tmp);
a = rtz/d;
// Memorise this
pAp[peri_k] = d;
axpy(x,a,p[peri_k],x);
RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
// Compute z = M x
PcgM1(r,z);
{
RealD n1,n2;
n1=norm2(r);
n2=norm2(z);
std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
}
M1(r,z,tmp,mp);
rtzp =real(innerProduct(r,z));
std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";
// PcgM2(z,p[0]);
PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
p[peri_kp]=mu;
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
// Standard search direction p -> z + b p
p[peri_kp]=p[peri_k];
// Standard search direction p -> z + b p ; b =
b = (rtzp)/rtz;
int northog;
// k=zero <=> peri_kp=1; northog = 1
// k=1 <=> peri_kp=2; northog = 2
// ... ... ...
// k=mmax-2<=> peri_kp=mmax-1; northog = mmax-1
// k=mmax-1<=> peri_kp=0; northog = 1
int northog;
// northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
@@ -193,324 +176,75 @@ class TwoLevelCG : public LinearFunction<Field>
}
RealD rrn=sqrt(rn/ssq);
RealD rtn=sqrt(rtz/ssq);
RealD rtnp=sqrt(rtzp/ssq);
std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";
std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
// Stopping condition
if ( rn <= rsq ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
_FineLinop.HermOp(x,mmp[0]);
HermOp(x,mmp); // Shouldn't this be something else?
axpy(tmp,-1.0,src,mmp[0]);
RealD mmpnorm = sqrt(norm2(mmp[0]));
RealD xnorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
return;
RealD psinorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
return k;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
RealD xnorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
// Non-convergence
assert(0);
}
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated p"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated mmp"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::cout << GridLogMessage<<"HDCG: fPcg allocated pAp"<<std::endl;
src[0].Grid()->Barrier();
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated z,mp,r,mu"<<std::endl;
src[0].Grid()->Barrier();
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
HDCGTimer.Start();
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
a[rhs] = rtz[rhs]/d[rhs];
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
}
// Compute z = M x (for *all* RHS)
PcgM1(r,z);
std::cout << GridLogMessage<<"HDCG::fPcg M1 complete"<<std::endl;
grid->Barrier();
RealD max_rn=0.0;
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG: rhs "<<rhs<<"fPcg k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
}
public:
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out)
{
std::cout << "PcgM1 default (cheat) mrhs version"<<std::endl;
for(int rhs=0;rhs<in.size();rhs++){
this->PcgM1(in[rhs],out[rhs]);
}
}
virtual void PcgM1(Field & in, Field & out) =0;
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
std::cout << "Vstart default (cheat) mrhs version"<<std::endl;
for(int rhs=0;rhs<x.size();rhs++){
this->Vstart(x[rhs],src[rhs]);
}
}
virtual void Vstart(Field & x,const Field & src)=0;
virtual void M(Field & in,Field & out,Field & tmp) {
virtual void PcgM2(const Field & in, Field & out) {
out=in;
}
virtual RealD PcgM3(const Field & p, Field & mmp){
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
virtual void M1(Field & in, Field & out) {// the smoother
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public TwoLevelCG<Field>
{
public:
///////////////////////////////////////////////////////////////////////////////////
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
///////////////////////////////////////////////////////////////////////////////////
GridBase *coarsegrid;
Aggregation &_Aggregates;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
///////////////////////////////////////////////////////////////////////////////////
// and most operator functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
{
coarsegrid = Aggregates.CoarseGrid;
};
virtual void PcgM1(Field & in, Field & out)
{
GRID_TRACE("MultiGridPreconditioner ");
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
Field tmp(this->grid);
Field Min(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
PcgM(in,Min); // Smoother call
GridStopWatch SmootherTimer;
GridStopWatch MatrixTimer;
SmootherTimer.Start();
this->_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
this->_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
HermOp(Min,out);
axpy(tmp,-1.0,out,in); // tmp = in - A Min
GridStopWatch ProjTimer;
GridStopWatch CoarseTimer;
GridStopWatch PromTimer;
ProjTimer.Start();
this->_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProm " << PromTimer.Elapsed() <<std::endl;
ProjectToSubspace(tmp,PleftProj);
ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
axpy(out,1.0,Min,tmp); // Min+tmp
}
virtual void Vstart(Field & x,const Field & src)
{
std::cout << GridLogMessage<<"HDCG: fPcg Vstart "<<std::endl;
virtual void M2(const Field & in, Field & out) {
out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
}
virtual RealD M3(const Field & p, Field & mmp){
double d,dd;
HermOpAndNorm(p,mmp,d,dd);
return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
}
virtual void VstartDef2(Field & x, const Field & src){
//case PcgDef2:
//case PcgAdef2:
//case PcgAdef2f:
//case PcgV11f:
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@@ -522,78 +256,142 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
Field r(this->grid);
Field mmp(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
std::cout << GridLogMessage<<"HDCG: fPcg Vstart projecting "<<std::endl;
this->_Aggregates.ProjectToSubspace(PleftProj,src);
std::cout << GridLogMessage<<"HDCG: fPcg Vstart coarse solve "<<std::endl;
this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);
Field r(grid);
Field mmp(grid);
HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
ProjectToSubspace(r,PleftProj);
ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp;
}
};
template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field>
{
public:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
TwoLevelADEF1defl(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
std::vector<Field> &_evec,
std::vector<RealD> &_eval) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
evec(_evec),
eval(_eval)
{};
// Can just inherit existing M2
// Can just inherit existing M3
// Simple vstart - do nothing
virtual void Vstart(Field & x,const Field & src){
x=src; // Could apply Q
};
// Override PcgM1
virtual void PcgM1(Field & in, Field & out)
{
GRID_TRACE("EvecPreconditioner ");
int N=evec.size();
Field Pin(this->grid);
Field Qin(this->grid);
//MP + Q = M(1-AQ) + Q = M
// // If we are eigenvector deflating in coarse space
// // Q = Sum_i |phi_i> 1/lambda_i <phi_i|
// // A Q = Sum_i |phi_i> <phi_i|
// // M(1-AQ) = M(1-proj) + Q
Qin.Checkerboard()=in.Checkerboard();
Qin = Zero();
Pin = in;
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
auto ip = TensorRemove(innerProduct(tmp,in));
axpy(Qin, ip / eval[i],tmp,Qin);
axpy(Pin, -ip ,tmp,Pin);
}
this->_Smoother(Pin,out);
out = out + Qin;
return;
}
};
NAMESPACE_END(Grid);
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out){
// P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ]
Field in_sbar(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
}
virtual void Pleft (Field & in,Field & out){
// P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ]
Field in_sbar(grid);
Field tmp2(grid);
Field Mtmp(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out);
HermOp(out,Mtmp);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
}
};
template<class Field>
class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp){
}
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
}
virtual void M2(Field & in, Field & out){
}
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
};
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif
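For orientation, the eigenvector-deflated preconditioner of TwoLevelADEF1defl::PcgM1 above reduces, on plain vectors, to the following Eigen sketch; every name in it (N, nev, evec, eval, in, out and the stand-in Smooth for _Smoother) is hypothetical:

    Eigen::VectorXcd Qin = Eigen::VectorXcd::Zero(N);
    Eigen::VectorXcd Pin = in;
    for (int i = 0; i < nev; i++) {
      std::complex<double> ip = evec[i].dot(in); // Eigen's dot conjugates evec[i]
      Qin += (ip / eval[i]) * evec[i];           // Q in = sum_i v_i <v_i|in>/lambda_i
      Pin -= ip * evec[i];                       // (1 - proj) in
    }
    out = Smooth(Pin) + Qin;                     // out = M (1 - proj) in + Q in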

View File

@@ -1,734 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/AdefGeneric.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
/*
* Compared to Tang-2009: P = Pleft, P^T = Pright, Q = MssInv.
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
*/
NAMESPACE_BEGIN(Grid);
template<class Field>
class TwoLevelCGmrhs
{
public:
RealD Tolerance;
Integer MaxIterations;
GridBase *grid;
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer;
GridStopWatch DeflateTimer;
GridStopWatch CoarseTimer;
GridStopWatch FineTimer;
GridStopWatch SmoothTimer;
GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// and most operator functions
TwoLevelCGmrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
Tolerance(tol),
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{
grid = fine;
};
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_zz,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
std::vector<Field> & MQ,
const std::vector<Field> & Z,
const std::vector<Field> & MZ)
{
RealD t0=usecond();
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
RealD t1=usecond();
m_zz = 0.5*(m_zz+m_zz.adjoint());
Eigen::MatrixXcd L = m_zz.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
RealD t3=usecond();
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
RealD t4=usecond();
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
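// ------------------------------------------------------------------
// Standalone toy (plain Eigen, no Grid types) of the Cholesky-based
// thin QR above; illustrative only, and compiles on its own once the
// comment markers are stripped.
//
//   #include <iostream>
//   #include <Eigen/Dense>
//   int main(void) {
//     const int N = 4;                                       // Nblock
//     Eigen::MatrixXcd R = Eigen::MatrixXcd::Random(16,N);   // tall, full rank
//     Eigen::MatrixXcd m_rr = R.adjoint()*R;                 // Gram matrix R^dag R
//     m_rr = 0.5*(m_rr + m_rr.adjoint());                    // force exact hermiticity
//     Eigen::MatrixXcd L = m_rr.llt().matrixL();             // Cholesky: m_rr = L L^dag
//     Eigen::MatrixXcd C = L.adjoint();                      // C = L^dag
//     Eigen::MatrixXcd Q = R * C.inverse();                  // Q = R C^{-1}
//     std::cout << (Q.adjoint()*Q - Eigen::MatrixXcd::Identity(N,N)).norm()
//               << std::endl;                                // ~ 0: Q^dag Q = 1
//     return 0;
//   }
// ------------------------------------------------------------------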
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalises Sebastien Birk's thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
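// Correspondence with the code below: C <-> m_C, S <-> m_S, M <-> m_M;
// both factorisations "QC = Z" and "QS = Q - Z.M" are realised by
// ThinQRfact via the Cholesky of the Gram matrix.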
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
assert(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
// std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
M3Timer.Start();
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
M3Timer.Stop();
a[rhs] = rtz[rhs]/d[rhs];
LinalgTimer.Start();
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
LinalgTimer.Stop();
}
// Compute z = M x (for *all* RHS)
M1Timer.Start();
PcgM1(r,z);
M1Timer.Stop();
RealD max_rn=0.0;
LinalgTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
// std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
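// Flexible CG with a varying preconditioner loses exact conjugacy, so the
// new direction is explicitly A-orthogonalised against (at most) the
// mmax-1 most recent retained directions; older history is dropped.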
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG:fPcg rhs "<<rhs<<" k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
LinalgTimer.Stop();
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : fine M3 "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
}
public:
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out) = 0;
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src) = 0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
}
virtual RealD PcgM3(const Field & p, Field & mmp){
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
};
template<class Field, class CoarseField>
class TwoLevelADEF2mrhs : public TwoLevelCGmrhs<Field>
{
public:
GridBase *coarsegrid;
GridBase *coarsegridmrhs;
LinearFunction<CoarseField> &_CoarseSolverMrhs;
LinearFunction<CoarseField> &_CoarseSolverPreciseMrhs;
MultiRHSBlockProject<Field> &_Projector;
MultiRHSDeflation<CoarseField> &_Deflator;
TwoLevelADEF2mrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolverMrhs,
LinearFunction<CoarseField> &CoarseSolverPreciseMrhs,
MultiRHSBlockProject<Field> &Projector,
MultiRHSDeflation<CoarseField> &Deflator,
GridBase *_coarsemrhsgrid) :
TwoLevelCGmrhs<Field>(tol, maxit,FineLinop,Smoother,Projector.fine_grid),
_CoarseSolverMrhs(CoarseSolverMrhs),
_CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs),
_Projector(Projector),
_Deflator(Deflator)
{
coarsegrid = Projector.coarse_grid;
coarsegridmrhs = _coarsemrhsgrid; // This could be in the projector
};
// Override Vstart
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
int nrhs=x.size();
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
// = [1 - Ass_inv A] Guess + Assinv src
// = P^T guess + Assinv src
// = Vstart [Tang notation]
// This gives:
// W^T (src - A x_0) = src_s - A guess_s - r_s
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
this->_Projector.blockProject(src,PleftProj);
this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
for(int rhs=0;rhs<nrhs;rhs++) {
InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
}
this->_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
}
this->_Projector.blockPromote(x,PleftMss_proj);
}
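// Illustrative consistency check (not executed here): after Vstart the
// coarse projection of the residual should vanish up to the accuracy of
// the precise coarse solve, i.e. blockProject(src - A x0) ~ 0.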
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
int nrhs=in.size();
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
std::vector<Field> tmp(nrhs,this->grid);
std::vector<Field> Min(nrhs,this->grid);
std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
// this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
this->_Smoother(in,Min);
this->SmoothTimer.Stop();
#else
for(int rhs=0;rhs<nrhs;rhs++) {
this->SmoothTimer.Start();
this->_Smoother(in[rhs],Min[rhs]);
this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
this->FineTimer.Start();
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
this->FineTimer.Stop();
}
this->ProjectTimer.Start();
this->_Projector.blockProject(tmp,PleftProj);
this->ProjectTimer.Stop();
this->DeflateTimer.Start();
this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
this->DeflateTimer.Stop();
this->InsertTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++) {
InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
}
this->InsertTimer.Stop();
this->CoarseTimer.Start();
this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
this->CoarseTimer.Stop();
this->InsertTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
}
this->InsertTimer.Stop();
this->PromoteTimer.Start();
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop();
this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
}
// this->zzz=out[0];
this->FineTimer.Stop();
}
};
NAMESPACE_END(Grid);

View File

@ -31,58 +31,6 @@ directory
NAMESPACE_BEGIN(Grid);
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
typedef typename Field::scalar_type scomplex;
int Nblock = X.size();
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
//
//Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
template<class Field>
double normv(const std::vector<Field> &P){
int Nblock = P.size();
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
//////////////////////////////////////////////////////////////////////////
@ -139,19 +87,10 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -171,20 +110,11 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
const std::vector<Field> & R)
{
InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
@ -256,7 +186,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@ -292,9 +221,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q;
@ -310,8 +236,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer;
SolverTimer.Start();
RealD max_resid=0;
int k;
for (k = 1; k <= MaxIterations; k++) {
@ -356,7 +280,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/
m_rr = m_C.adjoint() * m_C;
max_resid=0;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
@ -398,9 +322,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
@ -544,6 +466,43 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
IterationsToComplete = k;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation:
//--------------------------
@ -590,7 +549,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@ -627,7 +585,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
}
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);

View File

@ -38,13 +38,12 @@ NAMESPACE_BEGIN(Grid);
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
@ -55,26 +54,10 @@ public:
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv)
{};
ErrorOnNoConverge(err_on_no_conv){};
virtual void LogIteration(int k,RealD a,RealD b){
// std::cout << "ConjugateGradient::LogIteration() "<<std::endl;
};
virtual void LogBegin(void){
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
this->LogBegin();
GRID_TRACE("ConjugateGradient");
GridStopWatch PreambleTimer;
GridStopWatch ConstructTimer;
GridStopWatch NormTimer;
GridStopWatch AssignTimer;
PreambleTimer.Start();
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
@ -82,32 +65,22 @@ public:
RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred;
// Was doing copies
ConstructTimer.Start();
Field p (src.Grid());
Field mmp(src.Grid());
Field r (src.Grid());
ConstructTimer.Stop();
Field p(src);
Field mmp(src);
Field r(src);
// Initial residual computation & set up
NormTimer.Start();
ssq = norm2(src);
RealD guess = norm2(psi);
NormTimer.Stop();
assert(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) {
r = src;
p = r;
a = ssq;
} else {
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
}
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
AssignTimer.Stop();
ssq = norm2(src);
// Handle trivial case of zero src
if (ssq == 0.){
@ -137,7 +110,6 @@ public:
std::cout << GridLogIterative << std::setprecision(8)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
PreambleTimer.Stop();
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
@ -145,13 +117,9 @@ public:
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
RealD usecs = -usecond();
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
GridStopWatch IterationTimer;
IterationTimer.Start();
c = cp;
MatrixTimer.Start();
@ -183,44 +151,32 @@ public:
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
LogIteration(k,a,b);
IterationTimer.Stop();
if ( (k % 500) == 0 ) {
std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
} else {
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
}
// Stopping condition
if (cp <= rsq) {
usecs +=usecond();
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
GridBase *grid = src.Grid();
RealD DwfFlops = (1452. )*grid->gSites()*4*k
+ (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl;
// std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver Elapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@ -231,143 +187,17 @@ public:
}
}
// Failed. Calculate true residual before giving up
// Linop.HermOpAndNorm(psi, mmp, d, qq);
// p = mmp - src;
//TrueResidual = sqrt(norm2(p)/ssq);
// TrueResidual = 1;
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
SolverTimer.Stop();
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage<< "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
TrueResidual = sqrt(norm2(p)/ssq);
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
// Optionally record the CG polynomial
std::vector<double> ak;
std::vector<double> bk;
std::vector<double> poly_p;
std::vector<double> poly_r;
std::vector<double> poly_Ap;
std::vector<double> polynomial;
public:
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
{ };
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
Field tmp(src.Grid());
Field AtoN(src.Grid());
AtoN = src;
psi=AtoN*polynomial[0];
for(int n=1;n<polynomial.size();n++){
tmp = AtoN;
Linop.HermOp(tmp,AtoN);
psi = psi + polynomial[n]*AtoN;
}
}
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
{
Field Ap(src.Grid());
Field r(src.Grid());
Field p(src.Grid());
p=src;
r=src;
x=Zero();
x.Checkerboard()=src.Checkerboard();
for(int k=0;k<ak.size();k++){
x = x + ak[k]*p;
Linop.HermOp(p,Ap);
r = r - ak[k] * Ap;
p = r + bk[k] * p;
}
}
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
psi=Zero();
this->operator ()(Linop,src,psi);
}
virtual void LogBegin(void)
{
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
ak.resize(0);
bk.resize(0);
polynomial.resize(0);
poly_Ap.resize(0);
poly_p.resize(1);
poly_r.resize(1);
poly_p[0]=1.0;
poly_r[0]=1.0;
};
virtual void LogIteration(int k,RealD a,RealD b)
{
// With zero guess,
// p = r = src
//
// iterate:
// x = x + a p
// r = r - a A p
// p = r + b p
//
// [0]
// r = x
// p = x
// Ap=0
//
// [1]
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
// x = x + a p ==> add polynomials term by term
// r = r - a A p ==> add polynomials term by term
// p = r + b p ==> add polynomials term by term
//
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
ak.push_back(a);
bk.push_back(b);
// Ap= right_shift(p)
poly_Ap.resize(k+1);
poly_Ap[0]=0.0;
for(int i=0;i<k;i++){
poly_Ap[i+1]=poly_p[i];
}
// x = x + a p
polynomial.resize(k);
polynomial[k-1]=0.0;
for(int i=0;i<k;i++){
polynomial[i] = polynomial[i] + a * poly_p[i];
}
// r = r - a Ap
// p = r + b p
poly_r.resize(k+1);
poly_p.resize(k+1);
poly_r[k] = poly_p[k] = 0.0;
for(int i=0;i<k+1;i++){
poly_r[i] = poly_r[i] - a * poly_Ap[i];
poly_p[i] = poly_r[i] + b * poly_p[i];
}
}
};
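// ------------------------------------------------------------------
// Usage sketch (illustrative; "HermOpLinop" and "src" are assumed to
// exist, and LatticeFermion is just a stand-in field type):
//
//   ConjugateGradientPolynomial<LatticeFermion> CGP(1.0e-8,10000);
//   LatticeFermion psi(src.Grid()), chk(src.Grid());
//   CGP.Solve(HermOpLinop,src,psi);      // zero-guess solve; records ak, bk, polynomial
//   CGP.PolyHermOp(HermOpLinop,src,chk); // chk = P(A) src, should reproduce psi
//   chk = chk - psi;                     // norm2(chk) should be small
// ------------------------------------------------------------------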
NAMESPACE_END(Grid);
#endif

View File

@ -82,6 +82,11 @@ NAMESPACE_BEGIN(Grid);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid();
//Generate precision change workspaces
precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
@ -108,25 +113,22 @@ NAMESPACE_BEGIN(Grid);
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
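// The inner single-precision solve only needs to beat the remaining outer
// gap: doubling until norm * inner_tol^2 >= stop leaves inner_tol within a
// factor of two of sqrt(stop/norm), avoiding needlessly tight inner solves.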
PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
precisionChange(src_f, src_d, wk_sp_from_dp);
PrecChangeTimer.Stop();
sol_f = Zero();
@ -145,7 +147,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
precisionChange(tmp_d, sol_f, wk_dp_from_sp);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);

View File

@ -1,213 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
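//The loop below implements, schematically:
//  r   = b - A x      (double-precision defect)
//  e_f = A_f^{-1} r_f (single-precision inner CG to tolerance inner_tol)
//  x   = x + e        (accumulate the correction in double)
//restarting until |r|^2 < tol^2 |b|^2, then patching up with a final
//double-precision CG.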
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -44,7 +44,7 @@ public:
using OperatorFunction<Field>::operator();
// RealD Tolerance;
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@ -84,7 +84,6 @@ public:
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
GRID_TRACE("ConjugateGradientMultiShift");
GridBase *grid = src.Grid();
@ -102,11 +101,11 @@ public:
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// remove dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
@ -144,7 +143,7 @@ public:
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid^2 "<<rsq[s]<<std::endl;
<<" target resid "<<rsq[s]<<std::endl;
ps[s] = src;
}
// r and p for primary
@ -325,8 +324,8 @@ public:
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k;

View File

@ -1,373 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
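//Flow: the whole multishift body runs with single-precision matrix
//multiplies; afterwards any shift whose true residual misses its target
//is re-solved with a two-level mixed-precision CG on the shifted operator.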
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChange(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
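// z[s][.] are the ratios zeta_s of the shifted to the unshifted residual
// polynomials in the standard multishift CG recurrence; iz toggles between
// the current and previous terms of the two-term recursion.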
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChange(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChange(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChange(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);

View File

@ -81,7 +81,6 @@ public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@ -96,9 +95,9 @@ public:
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
int _ReliableUpdateFreq
) :
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
@ -128,12 +127,10 @@ public:
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
@@ -156,11 +153,10 @@ public:
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
@@ -169,8 +165,12 @@ public:
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF r_f(SinglePrecGrid);
FieldF p_f(SinglePrecGrid);
FieldF tmp_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
FieldF src_f(SinglePrecGrid);
precisionChange(src_f, src_d, wk_f_from_d);
// Check lightest mass
for(int s=0;s<nshift;s++){
@@ -195,26 +195,18 @@ public:
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
r_f=src_f; //residual maintained in single
p_f=src_f;
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f, p_d, pc_wk_d_to_s);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
assert(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
axpy(mmp_f,mass[0],p_f,mmp_f);
RealD rn = norm2(p_f);
d += rn*mass[0];
b = -cp /d;
@@ -232,7 +224,7 @@ public:
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
c=axpy_norm(r_f,b,mmp_f,r_f);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
@@ -248,9 +240,14 @@ public:
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
for (k=1;k<=MaxIterations;k++){
a = c /cp;
//Update double precision search direction by residual
PrecChangeTimer.Start();
precisionChange(r_d, r_f, wk_d_from_f);
PrecChangeTimer.Stop();
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
@@ -267,28 +264,24 @@ public:
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
Linop_f.HermOp(p_f,mmp_f);
d=real(innerProduct(p_f,mmp_f));
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
PrecChangeTimer.Stop();
AXPYTimer.Start();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
axpy(mmp_f,mass[0],p_f,mmp_f);
AXPYTimer.Stop();
RealD rn = norm2(p_d);
RealD rn = norm2(p_f);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
@@ -314,12 +307,12 @@ public:
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
c = axpy_norm(r_d,b,mmp_d,r_d);
RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
AXPYTimer.Stop();
c = c_f;
if(k % ReliableUpdateFreq == 0){
RealD c_old = c;
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
@@ -328,10 +321,15 @@ public:
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
c = axpy_norm(r_d, -1.0, mmp_d, src_d);
RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
PrecChangeTimer.Start();
precisionChange(r_f, r_d, wk_f_from_d);
PrecChangeTimer.Stop();
c = c_d;
}
// Convergence checks
@@ -343,7 +341,7 @@ public:
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
@@ -354,17 +352,12 @@ public:
}
}
if ( all_converged || k == MaxIterationsMshift-1){
if ( all_converged ){
SolverTimer.Stop();
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
}
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
@@ -405,10 +398,12 @@ public:
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
// assert(0);
}
};

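The hunks above move the iterated residual into single precision and only periodically replace it with the true double-precision residual. A toy standalone illustration of that pattern in plain C++ (float as the working precision, and a damped-Richardson stand-in for the actual CG update):

#include <cstdio>
#include <cmath>

int main(){
  // 1-d toy "system" A x = b with A = 2; exact solution x = 0.5
  double A = 2.0, b = 1.0, x = 0.0;
  float  r_f = (float)b;            // working residual, kept in single precision
  int    freq = 8;                  // stand-in for ReliableUpdateFreq
  for(int k=1; k<=32; k++){
    double step = 0.4*(double)r_f/A;     // stand-in for the CG solution update
    x   += step;
    r_f -= (float)(A*step);              // cheap single-precision recurrence
    if(k % freq == 0){
      double r_d = b - A*x;              // true residual, double precision
      std::printf("k=%d replaced |r|=%g with |r|=%g\n",
                  k, std::fabs((double)r_f), std::fabs(r_d));
      r_f = (float)r_d;                  // restart the single-prec recurrence
    }
  }
  return 0;
}

The printed pairs show the drift between the cheap float recurrence and the true residual, which is exactly what the k % ReliableUpdateFreq block above corrects.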
View File

@@ -48,7 +48,7 @@ public:
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
RealD Delta; //reliable update parameter
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,9 +65,7 @@ public:
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{
assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
};
{};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
@@ -75,7 +73,6 @@ public:
}
void operator()(const FieldD &src, FieldD &psi) {
GRID_TRACE("ConjugateGradientReliableUpdate");
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
@@ -118,12 +115,9 @@ public:
}
//Single prec initialization
precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r, pc_wk_dp_to_sp);
precisionChange(r_f, r);
FieldF psi_f(r_f);
psi_f = Zero();
@@ -139,8 +133,7 @@ public:
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
GridStopWatch PrecChangeTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
@@ -179,9 +172,7 @@ public:
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
precisionChange(mmp, psi_f);
psi = psi + mmp;
@@ -202,10 +193,7 @@ public:
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
@@ -225,21 +213,14 @@ public:
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
precisionChange(mmp, psi_f);
psi = psi + mmp;
MatrixTimer.Start();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
MatrixTimer.Stop();
r = src - mmp;
psi_f = Zero();
PrecChangeTimer.Start();
precisionChange(r_f, r, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;

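Unlike the fixed-frequency scheme in the previous file, this class triggers an update whenever the iterated residual has dropped by a factor Delta since the last replacement. The trigger logic, reduced to a runnable scalar toy (values illustrative):

#include <cstdio>

int main(){
  double Delta = 0.1;               // reliable update parameter, 0 < Delta < 1
  double MaxResidSinceLastRelUp = 1.0;
  double cp = 1.0;                  // iterated |r|^2
  for(int k=1; k<=30; k++){
    cp *= 0.7;                                     // stand-in for CG progress
    if(cp > MaxResidSinceLastRelUp)                // also track any growth
      MaxResidSinceLastRelUp = cp;
    if(cp < Delta * MaxResidSinceLastRelUp){       // the Delta criterion
      std::printf("iter %d: reliable update at |r|^2 = %g\n", k, cp);
      // here: promote psi_f, recompute r = src - A psi in double, rezero psi_f
      MaxResidSinceLastRelUp = cp;                 // reset after replacement
    }
  }
  return 0;
}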
View File

@@ -113,43 +113,7 @@ public:
blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard();
};
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
int Nevec = (int)evec_coarse.size();
int Nsrc = (int)src.size();
// make temp variables
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
//Preprocessing
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
guess_coarse[j] = Zero();
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockProject(src_coarse[j],src[j],subspace);
}
//Deflation set up for eigenvector batch size 1 and source batch size equal to the number of sources
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
for (int i=0;i<Nevec;i++)
{
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
const CoarseField & tmp = evec_coarse[i];
for (int j=0;j<Nsrc;j++)
{
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
}
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockPromote iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
};
};

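The multi-RHS deflation above computes, for each source, guess = sum_i e_i <e_i, src> / lambda_i over the coarse eigenpairs. The same formula in a standalone plain-C++ sketch (real arithmetic, illustrative names, no block projection):

#include <vector>
#include <cstddef>

using Vec = std::vector<double>;

static double dot(const Vec &a, const Vec &b){
  double s = 0; for(size_t i=0;i<a.size();i++) s += a[i]*b[i]; return s;
}

Vec deflated_guess(const std::vector<Vec> &evec, const std::vector<double> &eval, const Vec &src){
  Vec guess(src.size(), 0.0);
  for(size_t i=0; i<evec.size(); i++){
    double c = dot(evec[i], src)/eval[i];          // project, divide by eigenvalue
    for(size_t j=0; j<src.size(); j++) guess[j] += c*evec[i][j];
  }
  return guess;
}

int main(){
  std::vector<Vec>    evec = {{1,0},{0,1}};        // orthonormal eigenvectors
  std::vector<double> eval = {0.5, 4.0};
  Vec src = {1.0, 2.0};
  Vec g = deflated_guess(evec, eval, src);         // g = (2.0, 0.5); A g = src exactly
  return 0;
}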
File diff suppressed because it is too large

View File

@@ -79,16 +79,14 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public Imp
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<" target " << eresid*eresid << " conv " <<conv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
};
@@ -245,10 +243,9 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = std::sqrt(vnum/vden);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@@ -256,7 +253,6 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
@@ -423,15 +419,14 @@ until convergence
}
}
if ( Nconv < Nstop ) {
if ( Nconv < Nstop )
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << " of which might meet convergence criterion only approximately" <<std::endl;
}
eval=eval2;
//Keep only converged
eval.resize(Nstop);// was Nconv
evec.resize(Nstop,grid);// was Nconv
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
basisSortInPlace(evec,eval,reverse);
}
@@ -461,7 +456,7 @@ until convergence
std::vector<Field>& evec,
Field& w,int Nm,int k)
{
std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20;
assert( k< Nm );
@@ -469,7 +464,7 @@ until convergence
Field& evec_k = evec[k];
_PolyOp(evec_k,w); std::cout<<GridLogDebug << "PolyOp" <<std::endl;
_PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
if(k>0) w -= lme[k-1] * evec[k-1];
@@ -484,18 +479,18 @@ until convergence
lme[k] = beta;
if ( (k>0) && ( (k % orth_period) == 0 )) {
std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl;
std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl;
std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
}
if(k < Nm-1) evec[k+1] = w;
std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl;
std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
}
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,

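One hunk above swaps the largest-eigenvalue estimate from the Rayleigh quotient <x,Ax>/<x,x> to sqrt(<Ax,Ax>/<x,x>) = ||Ax||/||x||, which weights the top modes quadratically and so settles faster under power iteration. A standalone numerical comparison on a diagonal 2x2 operator (values illustrative):

#include <cstdio>
#include <cmath>

int main(){
  double l1 = 10.0, l2 = 1.0;   // eigenvalues of a diagonal 2x2 operator
  double x1 = 1.0,  x2 = 1.0;   // test vector with weight on both eigenmodes
  double Ax1 = l1*x1, Ax2 = l2*x2;
  double rayleigh = (x1*Ax1 + x2*Ax2)/(x1*x1 + x2*x2);                // 5.5
  double normest  = std::sqrt((Ax1*Ax1 + Ax2*Ax2)/(x1*x1 + x2*x2));  // ~7.1
  std::printf("Rayleigh %g  ||Ax||/||x|| %g  true top eval %g\n",
              rayleigh, normest, l1);
  return 0;
}

Both estimators approach the top eigenvalue from below, but the norm-based one is closer for the same vector, which is why the new code squares the operator.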
View File

@@ -146,21 +146,14 @@ public:
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
//As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
//To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
//out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
//NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3,
int largestEvalIdxForReport=-1)
RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
_coarse_relax_tol(coarse_relax_tol)
{ };
//evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
@@ -186,12 +179,6 @@ public:
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
RealD tmp_eval;
ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
}
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
@@ -422,7 +409,7 @@ public:
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);

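The ReconstructEval reporting touched above amounts to: promote the coarse eigenvector to the fine grid, smooth it, and take the Rayleigh quotient with the fine operator. A plain-C++ sketch of just that quotient (illustrative names, not the Grid API, promotion and smoothing assumed already done):

#include <vector>
#include <functional>
#include <cstddef>

using Vec = std::vector<double>;
using Op  = std::function<void(const Vec&, Vec&)>;   // y = A_fine x

double reconstruct_eval(const Op &fineOp, const Vec &v){
  Vec Av(v.size());
  fineOp(v, Av);
  double num = 0, den = 0;
  for(size_t i=0; i<v.size(); i++){ num += v[i]*Av[i]; den += v[i]*v[i]; }
  return num/den;                   // Rayleigh quotient with the fine operator
}

int main(){
  Op fineOp = [](const Vec &x, Vec &y){ y = {3.0*x[0], 7.0*x[1]}; }; // diag(3,7)
  Vec v = {0.1, 1.0};   // stand-in for a promoted and smoothed coarse eigenvector
  double lam = reconstruct_eval(fineOp, v);   // ~6.96: near 7, slightly polluted
  (void)lam;
  return 0;
}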
View File

@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public LinearFunction<Field>{
template<class Field> class NormalEquations {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
@@ -60,33 +60,7 @@ public:
}
};
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
Field res(in.Grid());
Field tmp(in.Grid());
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
_Guess(in,res);
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
_Matrix.Mdag(res,out); // out = Mdag res
}
};
template<class Field> class HPDSolver : public LinearFunction<Field> {
template<class Field> class HPDSolver {
private:
LinearOperatorBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
@@ -104,13 +78,13 @@ public:
void operator() (const Field &in, Field &out){
_Guess(in,out);
_HermitianSolver(_Matrix,in,out); //M out = in
_HermitianSolver(_Matrix,in,out); // Mdag M out = Mdag in
}
};
template<class Field> class MdagMSolver : public LinearFunction<Field> {
template<class Field> class MdagMSolver {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;

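Both classes above rest on the normal-equations trick: turn the general system M x = b into a Hermitian one, either MdagM x = Mdag b (solve for x directly) or M Mdag y = b followed by x = Mdag y. A concrete standalone 2x2 check in plain C++ (real arithmetic, solved exactly instead of by CG):

#include <cstdio>

int main(){
  double M[2][2]={{2,1},{0,1}}, b[2]={3,1};      // nonsymmetric M; M (1,1)^T = b
  // A = Mdag M (M real, so Mdag = transpose)
  double A[2][2]={{M[0][0]*M[0][0]+M[1][0]*M[1][0], M[0][0]*M[0][1]+M[1][0]*M[1][1]},
                  {M[0][1]*M[0][0]+M[1][1]*M[1][0], M[0][1]*M[0][1]+M[1][1]*M[1][1]}};
  double rhs[2]={M[0][0]*b[0]+M[1][0]*b[1], M[0][1]*b[0]+M[1][1]*b[1]};  // Mdag b
  double det=A[0][0]*A[1][1]-A[0][1]*A[1][0];
  double x0=( A[1][1]*rhs[0]-A[0][1]*rhs[1])/det;
  double x1=(-A[1][0]*rhs[0]+A[0][0]*rhs[1])/det;
  std::printf("x = (%g, %g); direct solve of M x = b gives (1, 1)\n", x0, x1);
  return 0;
}

A is symmetric positive definite whenever M is invertible, which is what lets a Hermitian solver such as CG do the work.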
View File

@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
RealD evalMaxApprox = 0.0;
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_EST_ = 200;
const int _MAX_ITER_EST_ = 50;
for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -30,17 +30,18 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n);
RealD na = vnum/vden;
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
// evalMaxApprox = na;
// return evalMaxApprox;
// }
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
evalMaxApprox = na;
src_n = tmp;
}
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
assert(0);
return 0;
}
};
}

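For reference, the whole PowerMethod class boils down to a few lines. A standalone plain-C++ version on a fixed 2x2 matrix, using the same Rayleigh-quotient estimate and the 0.1% relative-change exit of the new code (the real class also gives up after _MAX_ITER_EST_ iterations):

#include <cstdio>
#include <cmath>

int main(){
  double A[2][2]={{4,1},{1,3}};
  double x[2]={1,0}, eval=0;
  for(int i=0;i<200;i++){
    double y[2]={A[0][0]*x[0]+A[0][1]*x[1], A[1][0]*x[0]+A[1][1]*x[1]}; // tmp = A x
    double vnum=y[0]*x[0]+y[1]*x[1];            // <x, A x>
    double vden=x[0]*x[0]+x[1]*x[1];
    double na=vnum/vden;                        // Rayleigh-quotient estimate
    if(std::fabs(eval/na-1.0)<0.001){ eval=na; break; }
    eval=na;
    double n=std::sqrt(y[0]*y[0]+y[1]*y[1]);
    x[0]=y[0]/n; x[1]=y[1]/n;                   // src_n = tmp, normalised
  }
  std::printf("largest eigenvalue approx %g (true (7+sqrt(5))/2 = 4.618)\n",eval);
  return 0;
}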
View File

@@ -1,76 +0,0 @@
#pragma once
namespace Grid {
class Band
{
RealD lo, hi;
public:
Band(RealD _lo,RealD _hi)
{
lo=_lo;
hi=_hi;
}
RealD operator() (RealD x){
if ( x>lo && x<hi ){
return 1.0;
} else {
return 0.0;
}
}
};
class PowerSpectrum
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
std::vector<RealD> ranges;
std::vector<int> order;
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
template<class Field>
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
int N=ranges.size();
RealD hi = ranges[N-1];
RealD lo_band = 0.0;
RealD hi_band;
RealD nn=norm2(src);
RealD ss=0.0;
Field tmp = src;
for(int b=0;b<N;b++){
hi_band = ranges[b];
Band Notch(lo_band,hi_band);
Chebyshev<Field> polynomial;
polynomial.Init(0.0,hi,order[b],Notch);
polynomial.JacksonSmooth();
polynomial(HermOp,src,tmp) ;
RealD p=norm2(tmp);
ss=ss+p;
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
lo_band=hi_band;
}
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
return 0;
};
};
}

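The deleted PowerSpectrum header measured how a source's norm distributes across spectral bands by applying a Jackson-smoothed Chebyshev approximation of each band indicator. For a diagonal operator the same quantity is elementary; a standalone toy showing what the band powers should sum to (values illustrative):

#include <cstdio>

int main(){
  double eval[4]  ={0.01, 0.1, 1.0, 10.0}; // spectrum of a toy diagonal operator
  double c[4]     ={1.0,  2.0, 1.0, 0.5};  // components of src in the eigenbasis
  double ranges[3]={0.05, 0.5, 50.0};      // band upper edges, as in 'ranges'
  double lo=0.0, total=0;
  for(int b=0;b<3;b++){
    double p=0;
    for(int i=0;i<4;i++) if(eval[i]>lo && eval[i]<ranges[b]) p+=c[i]*c[i]; // Band(lo,hi)
    std::printf("band [%g,%g) power %g\n", lo, ranges[b], p);
    total+=p; lo=ranges[b];
  }
  std::printf("total %g (should equal |src|^2 = %g)\n", total, 1.0+4.0+1.0+0.25);
  return 0;
}

The polynomial filter makes the same measurement possible without knowing the eigenbasis, at the cost of band leakage controlled by the polynomial order.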
View File

@@ -74,7 +74,7 @@ public:
void operator() (const Field &src, Field &psi){
// psi=Zero();
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;

View File

@@ -499,87 +499,6 @@ namespace Grid {
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, left preconditioned by Mee^inv
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe ) phi = Mee_inv eta
//
// Solve:
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
//
// Old notation e<->o
//
// Left precon by Moo^-1
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi = ( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = eta

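All the SchurRedBlack solvers above are instances of one piece of block algebra. With every block a 1x1 "matrix" the three stages (RedBlackSource, RedBlackSolve on the Schur complement, RedBlackSolution) read off directly; the preconditioned variants merely dress this with Mee^{-1}/Moo^{-1} factors and MpcDag to make the operator Hermitian. Standalone sketch in plain C++:

// Solve  [ Mee Meo ] [psi_e]   [eta_e]
//        [ Moe Moo ] [psi_o] = [eta_o]
#include <cstdio>

int main(){
  double Mee=4, Meo=1, Moe=2, Moo=3;     // toy block entries
  double eta_e=6, eta_o=8;
  // RedBlackSource: src_o = eta_o - Moe * Mee^{-1} * eta_e
  double src_o = eta_o - Moe*(eta_e/Mee);
  // RedBlackSolve: (Moo - Moe Mee^{-1} Meo) psi_o = src_o   (Schur complement)
  double S = Moo - Moe*Meo/Mee;
  double psi_o = src_o/S;
  // RedBlackSolution: psi_e = Mee^{-1} ( eta_e - Meo psi_o )
  double psi_e = (eta_e - Meo*psi_o)/Mee;
  std::printf("psi_e=%g psi_o=%g  check: %g %g (want 6 8)\n",
              psi_e, psi_o, Mee*psi_e+Meo*psi_o, Moe*psi_e+Moo*psi_o);
  return 0;
}

With these numbers psi_e=1, psi_o=2, and the check reproduces eta exactly; the only iterative work in the real solver is the odd-site Schur-complement solve.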
View File

@@ -1,608 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/Aggregates.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x)
{
// return std::pow(x,-4);
// return std::pow(x,-3);
return std::pow(x,-5);
}
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
constexpr int Nbasis(void) { return nbasis; };
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
// std::cout << GridLogMessage <<" Block Gram-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspaceRandom(GridParallelRNG &RNG) {
int nn=nbasis;
RealD scale;
FineField noise(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
subspace[b] = noise;
}
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
RealD scale;
ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
subspace[b] = guess;
#endif
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
subspace[b] = noise;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis "<<nn<<" min "
<<ordermin<<" step "<<orderstep
<<" lo "<<filterlo<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
ComplexD ip;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possibly more fine-grained control is needed than a linear sweep,
// but there is a huge productivity gain if this is a simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
ComplexD ip;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn);
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
// Generate a full sequence of Chebyshevs
for(int n=1;n<nn;n++){
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
Cheb(hermop,subspace[n-1],Mn);
for(int m=0;m<n;m++){
ComplexD c = innerProduct(subspace[m],Mn);
Mn = Mn - c*subspace[m];
}
// normalise
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[n]=Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
// Refine
Chebyshev<FineField> PowerLaw(lo,hi,1000,AggregatePowerLaw);
noise = Mn;
PowerLaw(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
// normalise
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void CreateSubspaceChebyshevPowerLaw(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
int orderfilter
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" [0,"<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
Chebyshev<FineField> Cheb(0.0,hi,orderfilter,AggregatePowerLaw);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void CreateSubspaceChebyshevNew(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
double hi
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
//#opt2(x) = acheb(x,3,90,300)* acheb(x,1,90,50) * acheb(x,0.5,90,200) * acheb(x,0.05,90,400) * acheb(x,0.01,90,1500)
/*266
Chebyshev<FineField> Cheb1(3.0,hi,300);
Chebyshev<FineField> Cheb2(1.0,hi,50);
Chebyshev<FineField> Cheb3(0.5,hi,300);
Chebyshev<FineField> Cheb4(0.05,hi,500);
Chebyshev<FineField> Cheb5(0.01,hi,2000);
*/
/* 242 */
/*
Chebyshev<FineField> Cheb3(0.1,hi,300);
Chebyshev<FineField> Cheb2(0.02,hi,1000);
Chebyshev<FineField> Cheb1(0.003,hi,2000);
8?
*/
/* How many??
*/
Chebyshev<FineField> Cheb2(0.001,hi,2500); // 169 iters on HDCG after refine
Chebyshev<FineField> Cheb1(0.02,hi,600);
// Chebyshev<FineField> Cheb2(0.001,hi,1500);
// Chebyshev<FineField> Cheb1(0.02,hi,600);
Cheb1(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb1 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
Cheb2(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb2 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb3(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb3 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb4(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb4 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb5(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb5 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
subspace[b] = noise;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<< " norm " << norm2(noise)<<std::endl;
}
}
virtual void CreateSubspaceMultishift(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
double Lo,double tol,int maxit)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Multishift subspace : Lo "<<Lo<<std::endl;
// Filter
// [ 1/6(x+Lo) - 1/2(x+2Lo) + 1/2(x+3Lo) -1/6(x+4Lo) = Lo^3 /[ (x+1Lo)(x+2Lo)(x+3Lo)(x+4Lo) ]
//
// 1/(x+Lo) - 1/(x+2 Lo)
double epsilon = Lo/3;
std::vector<RealD> alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0});
std::vector<RealD> shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon});
std::vector<RealD> tols({tol,tol,tol,tol});
std::cout << "sizes "<<alpha.size()<<" "<<shifts.size()<<" "<<tols.size()<<std::endl;
MultiShiftFunction msf(4,0.0,95.0);
std::cout << "msf constructed "<<std::endl;
msf.poles=shifts;
msf.residues=alpha;
msf.tolerances=tols;
msf.norm=0.0;
msf.order=alpha.size();
ConjugateGradientMultiShift<FineField> MSCG(maxit,msf);
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
MSCG(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void RefineSubspace(LinearOperatorBase<FineField> &hermop,
double Lo,double tol,int maxit)
{
FineField tmp(FineGrid);
for(int b =0;b<nbasis;b++)
{
ConjugateGradient<FineField> CGsloppy(tol,maxit,false);
ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,Lo);
tmp=Zero();
CGsloppy(hermop,subspace[b],tmp);
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b]=tmp;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void RefineSubspaceHDCG(LinearOperatorBase<FineField> &hermop,
TwoLevelADEF2mrhs<FineField,CoarseVector> & theHDCG,
int nrhs)
{
std::vector<FineField> src_mrhs(nrhs,FineGrid);
std::vector<FineField> res_mrhs(nrhs,FineGrid);
FineField tmp(FineGrid);
for(int b =0;b<nbasis;b+=nrhs)
{
tmp = subspace[b];
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b] =tmp;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "before filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
for(int r=0;r<MIN(nbasis-b,nrhs);r++){
src_mrhs[r] = subspace[b+r];
}
for(int r=0;r<nrhs;r++){
res_mrhs[r] = Zero();
}
theHDCG(src_mrhs,res_mrhs);
for(int r=0;r<MIN(nbasis-b,nrhs);r++){
tmp = res_mrhs[r];
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b+r]=tmp;
}
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "after filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
};
NAMESPACE_END(Grid);

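Every CreateSubspaceChebyshev variant in the deleted file drives the same three-term recurrence: map the filter window [lo,hi] onto [-1,1] via x = (2A - (hi+lo))/(hi-lo) and iterate T_{n+1} = 2 x T_n - T_{n-1}, so modes below lo grow like cosh(n acosh|x|) while modes inside the window stay O(1). A standalone per-mode demonstration on a diagonal toy operator (values illustrative):

#include <cstdio>
#include <cmath>

int main(){
  double eval[3]={0.01, 1.0, 60.0};     // toy spectrum; filter window is [lo,hi]
  double v[3]   ={1.0,  1.0, 1.0 };
  double lo=0.1, hi=64.0, xscale=2.0/(hi-lo), mscale=-(hi+lo)/(hi-lo);
  double Tnm[3], Tn[3], Tnp[3];
  for(int i=0;i<3;i++){ Tnm[i]=v[i]; Tn[i]=(xscale*eval[i]+mscale)*v[i]; } // T0, T1
  for(int n=2;n<=50;n++){
    for(int i=0;i<3;i++){
      double y=(xscale*eval[i]+mscale)*Tn[i];   // y = (scaled A) T_n, per mode
      Tnp[i]=2.0*y-Tnm[i];                      // T_{n+1} = 2 x T_n - T_{n-1}
    }
    for(int i=0;i<3;i++){ Tnm[i]=Tn[i]; Tn[i]=Tnp[i]; }
  }
  for(int i=0;i<3;i++)
    std::printf("mode %g -> |T_50| amplitude %g\n", eval[i], std::fabs(Tn[i]));
  return 0;
}

Only the mode below lo is amplified (by roughly cosh(50 acosh 1.003) ~ 20 here), which is exactly how the filter turns white noise into a near-null-space vector; the pointer-swizzle in the deleted code is the same recurrence without the per-mode unrolling.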
View File

@@ -1,629 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef Lattice<CComplex > FineComplexField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
int hermitian;
GridBase * _FineGrid;
GridCartesian * _CoarseGrid;
NonLocalStencilGeometry &geom;
PaddedCell Cell;
GeneralLocalStencil Stencil;
std::vector<CoarseMatrix> _A;
std::vector<CoarseMatrix> _Adag;
std::vector<CoarseVector> MultTemporaries;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know
GridBase * FineGrid(void) { return _FineGrid; }; // this is all the linalg routines need to know
GridCartesian * CoarseGrid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know
/* void ShiftMatrix(RealD shift)
{
int Nd=_FineGrid->Nd();
Coordinate zero_shift(Nd,0);
for(int p=0;p<geom.npoint;p++){
if ( zero_shift==geom.shifts[p] ) {
_A[p] = _A[p]+shift;
// _Adag[p] = _Adag[p]+shift;
}
}
}
void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
{
int nfound=0;
std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
for(int p=0;p<geom.npoint;p++){
for(int pp=0;pp<CopyMe.geom.npoint;pp++){
// Search for the same relative shift
// Avoids brutal handling of Grid pointers
if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
_A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
// _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
nfound++;
}
}
}
assert(nfound==geom.npoint);
ExchangeCoarseLinks();
}
*/
GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
: geom(_geom),
_FineGrid(FineGrid),
_CoarseGrid(CoarseGrid),
hermitian(1),
Cell(_geom.Depth(),_CoarseGrid),
Stencil(Cell.grids.back(),geom.shifts)
{
{
int npoint = _geom.npoint;
}
_A.resize(geom.npoint,CoarseGrid);
// _Adag.resize(geom.npoint,CoarseGrid);
}
void M (const CoarseVector &in, CoarseVector &out)
{
Mult(_A,in,out);
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
assert(hermitian);
Mult(_A,in,out);
// if ( hermitian ) M(in,out);
// else Mult(_Adag,in,out);
}
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
RealD tmult2=0;
ttot=-usecond();
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
CoarseVector tin=in;
texch-=usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin);
texch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t osites=pin.Grid()->oSites();
RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+ 2.0*osites*sizeof(siteVector)*npoint;
{
tviews-=usecond();
autoView( in_v , pin, AcceleratorRead);
autoView( out_v , pout, AcceleratorWriteDiscard);
autoView( Stencil_v , Stencil, AcceleratorRead);
tviews+=usecond();
// Static and prereserve to keep UVM region live and not resized across multiple calls
ttemps-=usecond();
MultTemporaries.resize(npoint,pin.Grid());
ttemps+=usecond();
std::vector<Aview> AcceleratorViewContainer_h;
std::vector<Vview> AcceleratorVecViewContainer_h;
tviews-=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h.push_back( A[p].View(AcceleratorRead));
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
}
tviews+=usecond();
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
auto Aview_p = &AcceleratorViewContainer[0];
auto Vview_p = &AcceleratorVecViewContainer[0];
tcopy-=usecond();
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
tcopy+=usecond();
tmult-=usecond();
accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int32_t ss = spb/(nbasis*npoint);
int32_t bp = spb%(nbasis*npoint);
int32_t point= bp/nbasis;
int32_t b = bp%nbasis;
auto SE = Stencil_v.GetEntry(point,ss);
auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
for(int bb=1;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
}
coalescedWrite(Vview_p[point][ss](b),res);
});
tmult2-=usecond();
accelerator_for(sb, osites*nbasis, Nsimd, {
int ss = sb/nbasis;
int b = sb%nbasis;
auto res = coalescedRead(Vview_p[0][ss](b));
for(int point=1;point<npoint;point++){
res = res + coalescedRead(Vview_p[point][ss](b));
}
coalescedWrite(out_v[ss](b),res);
});
tmult2+=usecond();
tmult+=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h[p].ViewClose();
AcceleratorVecViewContainer_h[p].ViewClose();
}
}
text-=usecond();
out = Cell.Extract(pout);
text+=usecond();
ttot+=usecond();
std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogPerformance<<" of which mult2 "<<tmult2<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
// std::cout << GridLogPerformance<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
void PopulateAdag(void)
{
for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
Coordinate bcoor;
CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
for(int p=0;p<geom.npoint;p++){
Coordinate scoor = bcoor;
for(int mu=0;mu<bcoor.size();mu++){
int L = CoarseGrid()->GlobalDimensions()[mu];
scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
}
// Flip to poke/peekLocalSite and not too bad
auto link = peekSite(_A[p],scoor);
int pp = geom.Reverse(p);
pokeSite(adj(link),_Adag[pp],bcoor);
}
}
}
/////////////////////////////////////////////////////////////
//
// A) Only reduced flops option is to use a padded cell of depth 4
// and apply MpcDagMpc in the padded cell.
//
// Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
// With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
// Cost is 81x more, same as stencil size.
//
// But: can eliminate comms and do as local dirichlet.
//
// Local exchange gauge field once.
// Apply to all vectors, local only computation.
// Must exchange ghost subcells in reverse process of PaddedCell to take inner products
//
// B) Can reduce cost: pad by 1, apply Deo (4^4+6^4+8^4+8^4 )/ (4x 4^4)
// pad by 2, apply Doe
// pad by 3, apply Deo
// then break out 8x directions; cost is ~10x MpcDagMpc per vector
//
// => almost factor of 10 in setup cost, excluding data rearrangement
//
// Intermediates -- ignore the corner terms, leave approximate and force Hermitian
// Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
/////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////
// BFM HDCG style approach: Solve a system of equations to get Aij
//////////////////////////////////////////////////////////
/*
* Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
*
* conj(phases[block]) proj[k][ block*Nvec+j ] = \sum_ball e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} >
* = \sum_ball e^{iqk.delta} A_ji
*
* Must invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*/
#if 0
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
CoarseVector coarseInner(CoarseGrid());
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
tphase-=usecond();
CoarseComplexField coor(CoarseGrid());
CoarseComplexField pha(CoarseGrid()); pha=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha =exp(pha*ci);
phaV=Zero();
blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
tphase+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
}
// Need to write something to populate Adag from A
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
*   = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
*   = \sum_{l in ball} e^{i q_k . delta_l} A_{ji}^{b,b+l}
*   = M_{kl} A_{ji}^{b,b+l}
*
* Must assemble and invert the matrix M_{kl} = e^{i q_k . delta_l}
*
* where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A_{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid());
CoarseVector coarseInner(CoarseGrid());
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
tphase=-usecond();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid());
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiply phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
// std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl;
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
}
for(int p=0;p<geom.npoint;p++){
std::cout << " _A["<<p<<"] "<<norm2(_A[p])<<std::endl;
}
// Need to write something to populate Adag from A
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#endif
void ExchangeCoarseLinks(void){
for(int p=0;p<geom.npoint;p++){
_A[p] = Cell.ExchangePeriodic(_A[p]);
// _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
}
}
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);
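For orientation, a minimal driver sketch for the coarsening interface above. Only CoarsenOperator and the Aggregation and geometry types are taken from this code; the constructor argument order and the HermOp name are illustrative assumptions, not confirmed by this diff.

// Sketch only (assumptions flagged above), not library code from this change.
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> CoarseOp;
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;

Subspace Aggregates(CoarseGrid,FineGrid,0);        // fill .subspace before coarsening
NextToNearestStencilGeometry4D geom(CoarseGrid);   // 2-hop coarse stencil
CoarseOp Ac(geom,FineGrid,CoarseGrid);             // constructor form assumed
Ac.CoarsenOperator(HermOp,Aggregates);             // Galerkin case: U == V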


@ -1,729 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef typename CComplex::scalar_object SComplex;
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef MultiGeneralCoarsenedMatrix<Fobj,CComplex,nbasis> MultiGeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef iVector<SComplex,nbasis > calcVector;
typedef iMatrix<SComplex,nbasis > calcMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef Lattice<CComplex > FineComplexField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
GridCartesian * _CoarseGridMulti;
NonLocalStencilGeometry geom;
NonLocalStencilGeometry geom_srhs;
PaddedCell Cell;
GeneralLocalStencil Stencil;
deviceVector<calcVector> BLAS_B;
deviceVector<calcVector> BLAS_C;
std::vector<deviceVector<calcMatrix> > BLAS_A;
std::vector<deviceVector<ComplexD *> > BLAS_AP;
std::vector<deviceVector<ComplexD *> > BLAS_BP;
deviceVector<ComplexD *> BLAS_CP;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
GridCartesian * CoarseGrid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
// Can be used to do I/O on the operator matrices externally
void SetMatrix (int p,CoarseMatrix & A)
{
assert(A.size()==geom_srhs.npoint);
GridtoBLAS(A[p],BLAS_A[p]);
}
void GetMatrix (int p,CoarseMatrix & A)
{
assert(A.size()==geom_srhs.npoint);
BLAStoGrid(A[p],BLAS_A[p]);
}
void CopyMatrix (GeneralCoarseOp &_Op)
{
for(int p=0;p<geom.npoint;p++){
auto Aup = _Op.Cell.Extract(_Op._A[p]);
//Unpadded
GridtoBLAS(Aup,BLAS_A[p]);
}
}
/*
void CheckMatrix (GeneralCoarseOp &_Op)
{
std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
for(int p=0;p<geom.npoint;p++){
//Unpadded
auto Aup = _Op.Cell.Extract(_Op._A[p]);
auto Ack = Aup;
BLAStoGrid(Ack,BLAS_A[p]);
std::cout << p<<" Ack "<<norm2(Ack)<<std::endl;
std::cout << p<<" Aup "<<norm2(Aup)<<std::endl;
}
std::cout <<"************* "<<std::endl;
}
*/
MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridCartesian *CoarseGridMulti) :
_CoarseGridMulti(CoarseGridMulti),
geom_srhs(_geom),
geom(_CoarseGridMulti,_geom.hops,_geom.skip+1),
Cell(geom.Depth(),_CoarseGridMulti),
Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
{
int32_t padded_sites = Cell.grids.back()->lSites();
int32_t unpadded_sites = CoarseGridMulti->lSites();
int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS
int32_t orhs = nrhs/CComplex::Nsimd();
padded_sites = padded_sites/nrhs;
unpadded_sites = unpadded_sites/nrhs;
/////////////////////////////////////////////////
// Device data vector storage
/////////////////////////////////////////////////
BLAS_A.resize(geom.npoint);
for(int p=0;p<geom.npoint;p++){
BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
}
BLAS_B.resize(nrhs *padded_sites); // includes ghost zone
BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
BLAS_AP.resize(geom.npoint);
BLAS_BP.resize(geom.npoint);
for(int p=0;p<geom.npoint;p++){
BLAS_AP[p].resize(unpadded_sites);
BLAS_BP[p].resize(unpadded_sites);
}
BLAS_CP.resize(unpadded_sites);
/////////////////////////////////////////////////
// Pointers to data
/////////////////////////////////////////////////
// Site identity mapping for A
for(int p=0;p<geom.npoint;p++){
for(int ss=0;ss<unpadded_sites;ss++){
ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
acceleratorPut(BLAS_AP[p][ss],ptr);
}
}
// Site identity mapping for C
for(int ss=0;ss<unpadded_sites;ss++){
ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
acceleratorPut(BLAS_CP[ss],ptr);
}
// Neighbour table is more complicated
int32_t j=0; // Interior point counter (unpadded)
for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
int ghost_zone=0;
for(int32_t point = 0 ; point < geom.npoint; point++){
int i=s*orhs*geom.npoint+point;
if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
}
}
if( ghost_zone==0) {
for(int32_t point = 0 ; point < geom.npoint; point++){
int i=s*orhs*geom.npoint+point;
int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
assert(nbr<BLAS_B.size());
ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
}
j++;
}
}
assert(j==unpadded_sites);
}
template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *Fg = from.Grid();
assert(!Fg->_isCheckerBoarded);
int nd = Fg->_ndimension;
to.resize(Fg->lSites());
Coordinate LocalLatt = Fg->LocalDimensions();
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
////////////////////////////////////////////////////////////////////////////////////////////////
// do the index calc on the GPU
////////////////////////////////////////////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
autoView(from_v,from,AcceleratorRead);
auto to_v = &to[0];
const int words=sizeof(vobj)/sizeof(vector_type);
accelerator_for(idx,nsite,1,{
Coordinate from_coor, base;
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
for(int i=0;i<nd;i++){
from_coor[i] = base[i];
}
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
scalar_type* to = (scalar_type *)&to_v[idx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
to[w] = stmp;
}
});
}
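// Layout note: GridtoBLAS flattens the SIMD-vectorised lattice layout (outer site
// plus lane) into a plain site-major scalar array that batched BLAS can address
// directly; BLAStoGrid below performs the exact inverse scatter.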
template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *Tg = grid.Grid();
assert(!Tg->_isCheckerBoarded);
int nd = Tg->_ndimension;
assert(in.size()==Tg->lSites());
Coordinate LocalLatt = Tg->LocalDimensions();
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
////////////////////////////////////////////////////////////////////////////////////////////////
// do the index calc on the GPU
////////////////////////////////////////////////////////////////////////////////////////////////
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
autoView(to_v,grid,AcceleratorWrite);
auto from_v = &in[0];
const int words=sizeof(vobj)/sizeof(vector_type);
accelerator_for(idx,nsite,1,{
Coordinate to_coor, base;
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
for(int i=0;i<nd;i++){
to_coor[i] = base[i];
}
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type* from = (scalar_type *)&from_v[idx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp=from[w];
putlane(to[w], stmp, to_lane);
}
});
}
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace,
GridBase *CoarseGrid)
{
#if 0
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
*   = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
*   = \sum_{l in ball} e^{i q_k . delta_l} A_{ji}^{b,b+l}
*   = M_{kl} A_{ji}^{b,b+l}
*
* Must assemble and invert the matrix M_{kl} = e^{i q_k . delta_l}
*
* where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A_{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
phaV = phaF[p]*Subspace.subspace[i];
/////////////////////////////////////////////////////////////////////
// Multiply phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
linop.Op(phaV,MphaV);
// Fixme, could use batched block projector here
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
for(int k=0;k<npoint;k++){
FT = Zero();
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
/*
Grid : Message : 11698.730546 s : CoarsenOperator eigen 1334 us
Grid : Message : 11698.730563 s : CoarsenOperator phase 34729 us
Grid : Message : 11698.730565 s : CoarsenOperator phaseBZ 2423814 us
Grid : Message : 11698.730566 s : CoarsenOperator mat 127890998 us
Grid : Message : 11698.730567 s : CoarsenOperator proj 515840840 us
Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us
Takes 600s to compute matrix elements, DOMINATED by the block project.
Easy to speed up with the batched block project.
Store npoint vectors, get an npoint x Nbasis block projection, 81-fold faster.
// Block project below takes this to 240s
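(npoint is 81 for the 4-hop stencil, i.e. 3^4 shifts, so one batched projection
replaces 81 separate blockProject calls; that is the origin of the 81-fold estimate.)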
Grid : Message : 328.193418 s : CoarsenOperator phase 38338 us
Grid : Message : 328.193434 s : CoarsenOperator phaseBZ 1711226 us
Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
//Grid : Message : 328.193438 s : CoarsenOperator proj 1181154 us <-- this is mistimed
//Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us <-- Cut this ~10x if lucky by loop fusion
*/
#else
RealD tproj=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
MultiRHSBlockProject<Lattice<Fobj> > Projector;
Projector.Allocate(nbasis,grid,CoarseGrid);
Projector.ImportBasis(Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
*   = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
*   = \sum_{l in ball} e^{i q_k . delta_l} A_{ji}^{b,b+l}
*   = M_{kl} A_{ji}^{b,b+l}
*
* Must assemble and invert the matrix M_{kl} = e^{i q_k . delta_l}
*
* where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A_{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
tphase=-usecond();
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
// Could use smaller chunks than npoint == 81 and save memory
int batch = 9;
std::vector<FineField> _MphaV(batch,grid);
std::vector<CoarseVector> TmpProj(batch,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
// std::cout << GridLogMessage << " phasing the fine vector "<<std::endl;
// Fixme : do this in batches
for(int p=0;p<npoint;p+=batch){ // Loop over momenta in npoint
for(int b=0;b<MIN(batch,npoint-p);b++){
tphaseBZ-=usecond();
phaV = phaF[p+b]*Subspace.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiply phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
// Memory footprint was an issue
tmat-=usecond();
linop.Op(phaV,MphaV);
_MphaV[b] = MphaV;
tmat+=usecond();
}
// std::cout << GridLogMessage << " Calling block project "<<std::endl;
tproj-=usecond();
Projector.blockProject(_MphaV,TmpProj);
tproj+=usecond();
// std::cout << GridLogMessage << " conj phasing the coarse vectors "<<std::endl;
for(int b=0;b<MIN(batch,npoint-p);b++){
ComputeProj[p+b] = conjugate(pha[p+b])*TmpProj[b];
}
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
// std::cout << GridLogMessage << " Starting FT inv "<<std::endl;
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT = Zero();
// 81 kernel calls, one per ComputeProj vector
// Could fuse with a vector of views, but ugly
// Could unroll the expression and run fewer kernels -- much more attractive
// Could also do non-blocking.
#if 0
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#else
const int radix = 9;
int ll;
for(ll=0;ll+radix-1<npoint;ll+=radix){
// When ll = npoint-radix, ll+radix-1 = npoint-1, and we do it all.
FT = FT
+ invMkl(ll+0,k)*ComputeProj[ll+0]
+ invMkl(ll+1,k)*ComputeProj[ll+1]
+ invMkl(ll+2,k)*ComputeProj[ll+2]
+ invMkl(ll+3,k)*ComputeProj[ll+3]
+ invMkl(ll+4,k)*ComputeProj[ll+4]
+ invMkl(ll+5,k)*ComputeProj[ll+5]
+ invMkl(ll+6,k)*ComputeProj[ll+6]
+ invMkl(ll+7,k)*ComputeProj[ll+7]
+ invMkl(ll+8,k)*ComputeProj[ll+8];
}
for(int l=ll;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#endif
// 1 kernel call -- must be cheaper
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
// std::cout << GridLogMessage << " Calling GridtoBLAS "<<std::endl;
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
#endif
}
void Mdag(const CoarseVector &in, CoarseVector &out)
{
this->M(in,out);
}
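// Note: forwarding Mdag to M is only exact for a hermitian coarse operator;
// a nonhermitian coarse matrix would need the Adag links discussed above.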
void M (const CoarseVector &in, CoarseVector &out)
{
// std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
RealD t_tot;
RealD t_exch;
RealD t_GtoB;
RealD t_BtoG;
RealD t_mult;
t_tot=-usecond();
CoarseVector tin=in;
t_exch=-usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
t_exch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef calcMatrix* Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
assert(nrhs>=1);
RealD flops,bytes;
int64_t osites=in.Grid()->oSites(); // unpadded
int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs;
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
+ 2.0*osites*sizeof(siteVector)*npoint;
t_GtoB=-usecond();
GridtoBLAS(pin,BLAS_B);
t_GtoB+=usecond();
GridBLAS BLAS;
t_mult=-usecond();
for(int p=0;p<geom.npoint;p++){
RealD c = 1.0;
if (p==0) c = 0.0;
ComplexD beta(c);
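// First stencil point overwrites the accumulator (beta = 0); later points accumulate (beta = 1).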
BLAS.gemmBatched(nbasis,nrhs,nbasis,
ComplexD(1.0),
BLAS_AP[p],
BLAS_BP[p],
ComplexD(c),
BLAS_CP);
}
BLAS.synchronise();
t_mult+=usecond();
t_BtoG=-usecond();
BLAStoGrid(out,BLAS_C);
t_BtoG+=usecond();
t_tot+=usecond();
/*
std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult tot "<<t_tot<<" us"<<std::endl;
*/
// std::cout << GridLogMessage<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);
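A usage sketch for the multi-RHS operator above. The constructor, CopyMatrix and M all appear in this file; the construction of CoarseGridMulti, with the right-hand-side index packed into dimension 0, is assumed to follow the surrounding test codes.

// Sketch: import the coarse links from a single-RHS operator, then apply.
typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MrhsOp;
MrhsOp mrhs(geom_srhs,CoarseGridMulti);      // geometry of the single-RHS operator
mrhs.CopyMatrix(Ac);                         // pull in the unpadded _A[p] links
MrhsOp::CoarseVector src(CoarseGridMulti), res(CoarseGridMulti);
mrhs.M(src,res);                             // one batched GEMM per stencil point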


@ -1,238 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////
// Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
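A quick spot-check of the new indexing (a sketch for the 4d case, where base = 0):

Geometry g(4);
assert(g.point(0, 0) == 8);    // diagonal/self point
assert(g.point(2,+1) == 2);    // (1-1)/2 * 4 + 2
assert(g.point(2,-1) == 6);    // (1+1)/2 * 4 + 2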
/////////////////////////////////////////////////////////////////
// Less local equivalent of Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class NonLocalStencilGeometry {
public:
// int depth;
int skip;
int hops;
int npoint;
std::vector<Coordinate> shifts;
Coordinate stencil_size;
Coordinate stencil_lo;
Coordinate stencil_hi;
GridCartesian *grid;
GridCartesian *Grid() {return grid;};
int Depth(void){return 1;}; // Ghost zone depth
int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
int DimSkip(void){return skip;};
virtual ~NonLocalStencilGeometry() {};
int Reverse(int point)
{
int Nd = Grid()->Nd();
Coordinate shft = shifts[point];
Coordinate rev(Nd);
for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
for(int p=0;p<npoint;p++){
if(rev==shifts[p]){
return p;
}
}
assert(0);
return -1;
}
void BuildShifts(void)
{
this->shifts.resize(0);
int Nd = this->grid->Nd();
int dd = this->DimSkip();
for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
Coordinate sft(Nd,0);
sft[dd+0] = s0;
sft[dd+1] = s1;
sft[dd+2] = s2;
sft[dd+3] = s3;
int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
if(nhops<=this->hops) this->shifts.push_back(sft);
}}}}
this->npoint = this->shifts.size();
std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
}
NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops,int _skip) : grid(_coarse_grid), hops(_hops), skip(_skip)
{
Coordinate latt = grid->GlobalDimensions();
stencil_size.resize(grid->Nd());
stencil_lo.resize(grid->Nd());
stencil_hi.resize(grid->Nd());
for(int d=0;d<grid->Nd();d++){
if ( latt[d] == 1 ) {
stencil_lo[d] = 0;
stencil_hi[d] = 0;
stencil_size[d]= 1;
} else if ( latt[d] == 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 0;
stencil_size[d]= 2;
} else if ( latt[d] > 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 1;
stencil_size[d]= 3;
}
}
this->BuildShifts();
};
};
// Need to worry about red-black now
class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
public:
virtual int DerivedDimSkip(void) { return 0;};
NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,0) { };
virtual ~NonLocalStencilGeometry4D() {};
};
class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
public:
virtual int DerivedDimSkip(void) { return 1; };
NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,1) { };
virtual ~NonLocalStencilGeometry5D() {};
};
/*
* A bunch of preset stencil-geometry option classes
*/
class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,4)
{
};
};
class NextToNextToNextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,4)
{
};
};
class NextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,2)
{
};
};
class NextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,2)
{
};
};
class NearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,1)
{
};
};
class NearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,1)
{
};
};
NAMESPACE_END(Grid);
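The preset hop counts above translate directly into stencil sizes. A standalone sketch (assuming every lattice extent exceeds 2, so each coordinate ranges over {-1,0,1}) reproduces the counts BuildShifts reports:

#include <cstdlib>
int StencilPoints(int hops) {   // 4d count of shifts with L1 norm <= hops
  int n=0;
  for(int s0=-1;s0<=1;s0++) for(int s1=-1;s1<=1;s1++)
  for(int s2=-1;s2<=1;s2++) for(int s3=-1;s3<=1;s3++)
    if( abs(s0)+abs(s1)+abs(s2)+abs(s3) <= hops ) n++;
  return n;
}
// StencilPoints(1)==9, StencilPoints(2)==33, StencilPoints(4)==81:
// Nearest, NextToNearest, NextToNextToNextToNearest respectively.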


@ -1,34 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid/algorithms/multigrid/MultiGrid.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/multigrid/Aggregates.h>
#include <Grid/algorithms/multigrid/Geometry.h>
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>


@ -54,9 +54,6 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -69,7 +66,7 @@ public:
}
// FIXME: hack for the copy constructor: it must be avoided so we do not fall into a single-threaded copy loop
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
@ -103,9 +100,6 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -151,9 +145,6 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -174,48 +165,18 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
#ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
/*
template<class T> class vecView
{
protected:
T * data;
uint64_t size;
ViewMode mode;
void * cpu_ptr;
public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(Vector<T> &refer_to_me,ViewMode _mode)
{
cpu_ptr = &refer_to_me[0];
size = refer_to_me.size();
mode = _mode;
data =(T *) MemoryManager::ViewOpen(cpu_ptr,
size*sizeof(T),
mode,
AdviseDefault);
}
void ViewClose(void)
{ // Inform the manager
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
{
vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed
}
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid);


@ -4,56 +4,15 @@ NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuHuge (1)
#define CpuSmall (2)
#define Acc (3)
#define AccHuge (4)
#define AccSmall (5)
#define Shared (6)
#define SharedHuge (7)
#define SharedSmall (8)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;
#if defined(__has_feature)
#if __has_feature(leak_sanitizer)
#define ASAN_LEAK_CHECK
#endif
#endif
#ifdef ASAN_LEAK_CHECK
#include <sanitizer/asan_interface.h>
#include <sanitizer/common_interface_defs.h>
#include <sanitizer/lsan_interface.h>
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
#else
#define LEAK_CHECK(A) { }
#endif
void MemoryManager::DisplayMallinfo(void)
{
#ifdef __linux__
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
mi = mallinfo();
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
#endif
LEAK_CHECK();
}
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@ -73,18 +32,15 @@ void MemoryManager::PrintBytes(void)
#ifdef GRID_CUDA
cuda_mem();
#endif
DisplayMallinfo();
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pointer caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
@ -214,16 +170,6 @@ void MemoryManager::Init(void)
}
}
str= getenv("GRID_ALLOC_NCACHE_HUGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuHuge]=Nc;
Ncache[AccHuge]=Nc;
Ncache[SharedHuge]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
@ -244,9 +190,7 @@ void MemoryManager::InitMessage(void) {
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif
#ifdef GRID_UVM
@ -278,11 +222,8 @@ void MemoryManager::InitMessage(void) {
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
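The size-to-pool routing changed in this hunk reduces to a small pure function. A sketch (PoolIndex is an illustrative name, not library code; the limits are the GRID_ALLOC defines shown later in this diff):

#include <cstddef>
#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT  (2147483648)
// Pool constants per the new scheme: Cpu=0, CpuHuge=1, CpuSmall=2 (Acc, Shared likewise).
inline int PoolIndex(int type, size_t bytes) {
  if (bytes <  GRID_ALLOC_SMALL_LIMIT) return type + 2;  // small pool
  if (bytes >= GRID_ALLOC_HUGE_LIMIT)  return type + 1;  // huge pool
  return type;                                           // default pool
}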
@ -291,12 +232,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
if (ncache == 0) return ptr;
void * ret = NULL;
int v = -1;
@ -331,11 +271,8 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
@ -344,6 +281,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif


@ -35,12 +35,6 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT (2147483648)
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__)
#define AUDIT(a) MemoryManager::Audit(FILE_LINE)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
@ -71,21 +65,6 @@ enum ViewMode {
CpuWriteDiscard = 0x10 // same for now
};
struct MemoryStatus {
uint64_t DeviceBytes;
uint64_t DeviceLRUBytes;
uint64_t DeviceMaxBytes;
uint64_t HostToDeviceBytes;
uint64_t DeviceToHostBytes;
uint64_t HostToDeviceXfer;
uint64_t DeviceToHostXfer;
uint64_t DeviceEvictions;
uint64_t DeviceDestroy;
uint64_t DeviceAllocCacheBytes;
uint64_t HostAllocCacheBytes;
};
class MemoryManager {
private:
@ -99,7 +78,7 @@ private:
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=9;
static const int NallocType=6;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
@ -113,9 +92,8 @@ private:
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
public:
static void PrintBytes(void);
static void Audit(std::string s);
public:
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);
@ -135,28 +113,7 @@ private:
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
static uint64_t DeviceEvictions;
static uint64_t DeviceDestroy;
static uint64_t DeviceCacheBytes();
static uint64_t HostCacheBytes();
static MemoryStatus GetFootprint(void) {
MemoryStatus stat;
stat.DeviceBytes = DeviceBytes;
stat.DeviceLRUBytes = DeviceLRUBytes;
stat.DeviceMaxBytes = DeviceMaxBytes;
stat.HostToDeviceBytes = HostToDeviceBytes;
stat.DeviceToHostBytes = DeviceToHostBytes;
stat.HostToDeviceXfer = HostToDeviceXfer;
stat.DeviceToHostXfer = DeviceToHostXfer;
stat.DeviceEvictions = DeviceEvictions;
stat.DeviceDestroy = DeviceDestroy;
stat.DeviceAllocCacheBytes = DeviceCacheBytes();
stat.HostAllocCacheBytes = HostCacheBytes();
return stat;
};
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
@ -209,12 +166,10 @@ private:
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
static void NotifyDeletion(void * CpuPtr);
public:
static void DisplayMallinfo(void);
static void NotifyDeletion(void * CpuPtr);
static void Print(void);
static void PrintAll(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);


@ -1,15 +1,11 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...)
//#define mprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
@ -27,8 +23,6 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
////////////////////////////////////
// Priority ordering for unlocked entries
@ -110,17 +104,15 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++;
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@ -129,36 +121,26 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
// Cannot be acclocked. If allocated must be in LRU pool.
//
// Nov 2022... Felix issue: allocating two CpuPtrs can leave an entry in the LRU queue with cpuLock set,
// yet require the AccPtr copy to be evicted. Eviction was a mistake in CpuViewOpen,
// but there is a weakness where cpuLock'ed entries are attempted for erase.
// Take these OUT of the LRU queue when CPU locked?
// Cannot take them out of the table, as the cpuLock data is important.
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return;
if (AccCache.cpuLock!=0) return;
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
// EntryErase(CpuPtr);
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
@ -168,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@ -183,9 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
@ -211,7 +191,6 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
@ -223,7 +202,6 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -234,19 +212,13 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back(); // From the LRU
uint64_t victim = LRU.back();
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
} else {
return;
}
}
}
@ -269,12 +241,11 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes,
(uint64_t)AccCache.accLock);
(uint64_t)bytes);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
@ -309,7 +280,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
@ -322,30 +292,28 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
assert(0);
}
assert(AccCache.accLock>0);
// If view is opened on device must remove from LRU
// If view is opened on device remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache);
}
@ -366,12 +334,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache);
} else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@ -408,10 +374,9 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
// CPU doesn't need to free space
// if (!AccCache.AccPtr) {
// EvictVictims(bytes);
// }
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
@ -465,29 +430,20 @@ void MemoryManager::NotifyDeletion(void *_ptr)
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Memory Manager " << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogMessage << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transferred to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transferred from device " << std::endl;
std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
acceleratorMem();
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
}
void MemoryManager::PrintAll(void)
{
Print();
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transferred to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transferred from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
@ -497,13 +453,13 @@ void MemoryManager::PrintAll(void)
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
@ -517,63 +473,6 @@ int MemoryManager::isOpen (void* _CpuPtr)
return 0;
}
}
void MemoryManager::Audit(std::string s)
{
uint64_t CpuBytes=0;
uint64_t AccBytes=0;
uint64_t LruBytes1=0;
uint64_t LruBytes2=0;
uint64_t LruCnt=0;
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it;
assert(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes;
assert(AccCache.LRU_valid==1);
assert(AccCache.LRU_entry==it);
}
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
CpuBytes+=AccCache.bytes;
if( AccCache.AccPtr ) AccBytes+=AccCache.bytes;
if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) {
assert(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t cpuLock " << AccCache.cpuLock
<< "\t accLock " << AccCache.accLock
<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
}
assert( AccCache.cpuLock== 0 ) ;
assert( AccCache.accLock== 0 ) ;
}
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
assert(LruBytes1==LruBytes2);
assert(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
assert(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
assert(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
}
void MemoryManager::PrintState(void* _CpuPtr)
{
@ -590,8 +489,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;

View File

@ -12,10 +12,7 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
void MemoryManager::Audit(std::string s){};
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
@ -24,7 +21,6 @@ void MemoryManager::PrintState(void* CpuPtr)
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::PrintAll(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);

View File

@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
std::vector<uint64_t> pagedata(npages);
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
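check_huge_pages above walks /proc/self/pagemap: each 4 KiB virtual page owns one 8-byte entry at offset 8*virt_pfn, so reading npages entries recovers the physical frame behind every page of the buffer. A minimal sketch of decoding a single entry (documented kernel ABI: bits 0-54 hold the PFN, bit 63 the present flag; reading PFNs typically needs root/CAP_SYS_ADMIN):

#include <cstdint>
#include <fcntl.h>
#include <unistd.h>

uint64_t pfn_of(void *addr) {
  long page_size = sysconf(_SC_PAGESIZE);
  int fd = open("/proc/self/pagemap", O_RDONLY);
  if (fd < 0) return 0;
  uint64_t entry = 0;
  off_t offset = ((uint64_t)addr / page_size) * sizeof(uint64_t);
  pread(fd, &entry, sizeof(entry), offset);
  close(fd);
  if (!(entry >> 63)) return 0;              // page not present
  return entry & ((1ULL << 55) - 1);         // PFN lives in bits 0-54
}

// A 2 MiB huge page then appears as 512 consecutive 4 KiB entries with
// consecutive PFNs, which is what the nhugepages = npages/512 count probes.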

View File

@ -70,8 +70,8 @@ public:
Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int64_t _fsites; // _isites*_osites = product(dimensions).
int64_t _gsites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
Coordinate _slice_block;// subslice information
Coordinate _slice_stride;
Coordinate _slice_nblock;
@ -82,7 +82,6 @@ public:
bool _isCheckerBoarded;
int LocallyPeriodic;
Coordinate _checker_dim_mask;
int _checker_dim;
public:
@ -90,7 +89,7 @@ public:
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim) =0;
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
@ -184,7 +183,7 @@ public:
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; };
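The int to int64_t widening of _fsites/_gsites in this hunk is an overflow fix: the global site count is the product of all lattice dimensions (times ranks), and a signed 32-bit int saturates at 2^31-1, which modern volumes exceed. A one-liner showing the threshold (lattice size illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  int L = 256;                               // a 256^4 global lattice
  int64_t gsites = (int64_t)L * L * L * L;   // 4,294,967,296 sites
  printf("%lld > INT_MAX (%d), so an int gSites() wraps\n",
         (long long)gsites, 2147483647);
}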
@ -215,7 +214,7 @@ public:
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
@ -223,7 +222,7 @@ public:
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
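The conversion being edited here is plain mixed-radix arithmetic: dimension 0 runs fastest, and the index accumulates gcoor[mu]*mult with mult the running product of the dimensions. The diff truncates the loop body, so the sketch below shows the standard lexicographic accumulation rather than a verbatim copy:

#include <cstdint>
#include <vector>

// Lexicographic index from a coordinate, dimension 0 fastest.
int64_t IndexFromCoor(const std::vector<int> &coor, const std::vector<int> &dims) {
  int64_t idx = 0, mult = 1;
  for (size_t mu = 0; mu < dims.size(); mu++) {
    idx  += mult * coor[mu];
    mult *= dims[mu];
  }
  return idx;   // e.g. coor={1,2,3}, dims={4,4,4} -> 1 + 2*4 + 3*16 = 57
}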

View File

@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
public:
int dummy;
// Coordinate _checker_dim_mask;
Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
@ -46,7 +46,7 @@ public:
{
return 0;
}
virtual int CheckerBoarded(int dim) {
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(const Coordinate &site){
@ -106,7 +106,6 @@ public:
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_checker_dim = -1;
_lstart.resize(_ndimension);
_lend.resize(_ndimension);

View File

@ -57,10 +57,9 @@ class GridRedBlackCartesian : public GridBase
{
public:
// Coordinate _checker_dim_mask;
// int _checker_dim;
int _checker_dim;
std::vector<int> _checker_board;
virtual int isCheckerBoarded(void) const { return 1; };
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;
@ -148,7 +147,7 @@ public:
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
}
virtual ~GridRedBlackCartesian() = default;
void Init(const Coordinate &dimensions,

View File

@ -57,29 +57,18 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);

View File

@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
@ -55,11 +53,10 @@ public:
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
int _processor; // linear processor rank
unsigned long _ndimension;
Coordinate _shm_processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
Coordinate _processor_coor; // linear processor coordinate
unsigned long _ndimension;
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
@ -100,16 +97,14 @@ public:
int BossRank(void) ;
int ThisRank(void) ;
const Coordinate & ThisProcessorCoor(void) ;
const Coordinate & ShmGrid(void) { return _shm_processors; } ;
const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ;
int ProcessorCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
static void BarrierWorld(void);
////////////////////////////////////////////////////////////
// Reduction
@ -129,54 +124,17 @@ public:
void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
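GlobalSumP2P above trades MPI_Allreduce for explicit point-to-point exchanges: every rank gathers all partials along each processor dimension, sums them locally, and the final Broadcast from rank 0 makes the result bitwise identical everywhere. A stripped-down one-dimensional version of the same idea in plain MPI (illustrative, not Grid's API):

#include <mpi.h>
#include <vector>

// Reproducible global sum: gather every rank's partial, sum locally,
// then adopt rank 0's rounding via a broadcast.
double RingSum(double local, MPI_Comm comm) {
  int rank, P;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &P);
  std::vector<double> column(P);
  column[0] = local;
  for (int p = 1; p < P; p++) {
    int dest   = (rank + p) % P;        // our partial travels p hops forward
    int source = (rank - p + P) % P;    // we collect the partial p hops back
    MPI_Sendrecv(&column[0], 1, MPI_DOUBLE, dest,   p,
                 &column[p], 1, MPI_DOUBLE, source, p,
                 comm, MPI_STATUS_IGNORE);
  }
  double sum = 0;
  for (int p = 0; p < P; p++) sum += column[p];
  MPI_Bcast(&sum, 1, MPI_DOUBLE, 0, comm);  // pin rank 0's result on all ranks
  return sum;
}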
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o; // Safe alias
scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir);
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
@ -184,28 +142,17 @@ public:
int bytes);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,int do_recv,
int recv_from_rank,
int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
int recv_from_rank,
int bytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);

View File

@ -30,7 +30,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
@ -107,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors);
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
@ -125,13 +124,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
shm_processors [pad+d]=parent._shm_processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -156,7 +154,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@ -258,25 +255,6 @@ CartesianCommunicator::~CartesianCommunicator()
}
}
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -307,54 +285,25 @@ void CartesianCommunicator::GlobalMax(double &d)
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
MPI_Request xrq;
MPI_Request rrq;
assert(dest != _processor);
assert(from != _processor);
int tag;
tag= dir+from*32;
int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
@ -362,7 +311,9 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from,
int bytes)
{
std::vector<MpiCommsRequest_t> reqs(0);
std::vector<CommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor;
int ierr;
@ -378,40 +329,29 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dox,
int dest,
void *recv,
int from, int dor,
int from,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
int dest,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
int from,
int bytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
@ -429,369 +369,49 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
// TODO : make an OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
return off_node_bytes;
}
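The branches above encode the GET/PUT fork for intranode traffic: with NVLINK_GET the receiver pulls the neighbour's send buffer through shared memory, otherwise the sender pushes into the neighbour's receive buffer and a later StencilBarrier confirms delivery. The two directions in miniature, using the same Grid calls as above (ShmBufferTranslate maps a peer rank's pointer into our address space):

#ifdef NVLINK_GET
  // GET: locate the sender's xmit buffer on the peer and copy it home.
  void *src = ShmBufferTranslate(from, xmit);
  acceleratorCopyDeviceToDeviceAsynch(src, recv, bytes);
#else
  // PUT: locate the receiver's recv buffer on the peer and write into it;
  // the receiver must barrier before trusting its contents.
  void *dst = ShmBufferTranslate(dest, recv);
  acceleratorCopyDeviceToDeviceAsynch(xmit, dst, bytes);
#endif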
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
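One plausible caller-side driving sequence, consistent with the phase comments above and the method signatures in this diff (reqs and the per-packet arguments xmit/dest/dox/recv/from/dor are placeholders for the stencil's bookkeeping):

// PHASE 1: post receives; start device->host copies of the send payloads.
comm.StencilSendToRecvFromPrepare(reqs, xmit, dest, dox, recv, from, dor,
                                  xbytes, rbytes, dir);
// Overlap: as each DtoH copy finishes, fire the matching MPI_Isend.
comm.StencilSendToRecvFromPollDtoH(reqs);
// PHASE 2: post the intranode device->device copies (NVLINK/XeLink route).
comm.StencilSendToRecvFromBegin(reqs, xmit, dest, dox, recv, from, dor,
                                xbytes, rbytes, dir);
// Drain receives: start host->device copies as MPI_Test reports completion.
comm.StencilSendToRecvFromPollIRecv(reqs);
// PHASE 3: wait on outstanding sends, synchronise copies, free host buffers.
comm.StencilSendToRecvFromComplete(reqs, dir);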
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
@ -818,10 +438,6 @@ int CartesianCommunicator::RankWorld(void){
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BarrierWorld(void){
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,

View File

@ -45,14 +45,12 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{
_shm_processors = Coordinate(processors.size(),1);
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
_shm_processors = Coordinate(processors.size(),1);
_processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1);
_processor_coor.resize(_ndimension);
@ -91,17 +89,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
@ -115,7 +102,6 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
@ -125,32 +111,21 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
int xmit_to_rank,
void *recv,
int recv_from_rank,int dor,
int recv_from_rank,
int bytes, int dir)
{
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
int xmit_to_rank,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
int recv_from_rank,
int bytes, int dir)
{
return xbytes+rbytes;
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{

View File

@ -40,9 +40,6 @@ int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
void * GlobalSharedMemory::HostCommBuf;
#endif
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
@ -69,26 +66,6 @@ void GlobalSharedMemory::SharedMemoryFree(void)
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
void *SharedMemory::HostBufferMalloc(size_t bytes){
void *ptr = (void *)host_heap_top;
host_heap_top += bytes;
host_heap_bytes+= bytes;
if (host_heap_bytes >= host_heap_size) {
std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(host_heap_bytes<host_heap_size);
}
return ptr;
}
void SharedMemory::HostBufferFreeAll(void) {
host_heap_top =(size_t)HostCommBuf;
host_heap_bytes=0;
}
#endif
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
@ -114,59 +91,6 @@ void *SharedMemory::ShmBufferSelf(void)
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank];
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
NAMESPACE_END(Grid);

View File

@ -46,40 +46,8 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
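PacketType_t above is a per-packet state tag: a send advances InterNodeXmit to InterNodeXmitISend once its device-to-host copy completes and the MPI_Isend is posted, and a receive advances InterNodeRecv to InterNodeReceiveHtoD once MPI_Test succeeds and the payload starts back to the device. A sketch of that lifecycle as a single transition step (the Advance helper is illustrative; the calls inside it appear in this diff):

// Advance one packet's state machine; returns true on progress.
bool Advance(CommsRequest_t &r, MPI_Comm comm) {
  switch (r.PacketType) {
  case InterNodeXmit:                        // waiting on the DtoH copy
    if (acceleratorEventIsComplete(r.ev)) {
      MPI_Isend(r.host_buf, r.bytes, MPI_CHAR, r.dest, r.tag, comm, &r.req);
      r.PacketType = InterNodeXmitISend;     // now waiting on the MPI send
      return true;
    }
    return false;
  case InterNodeRecv: {                      // waiting on the MPI receive
    int flag = 0;
    MPI_Test(&r.req, &flag, MPI_STATUS_IGNORE);
    if (flag) {
      acceleratorCopyToDeviceAsynch(r.host_buf, r.device_buf, r.bytes);
      r.PacketType = InterNodeReceiveHtoD;   // payload heading to the device
      return true;
    }
    return false;
  }
  default: return false;                     // terminal or intranode states
  }
}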
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
@ -107,9 +75,7 @@ public:
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
static void *HostCommBuf;
#endif
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
@ -127,17 +93,16 @@ public:
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
// Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};
@ -154,13 +119,6 @@ private:
size_t heap_bytes;
size_t heap_size;
#ifndef ACCELERATOR_AWARE_MPI
size_t host_heap_top; // set in free all
size_t host_heap_bytes;// set in free all
void *HostCommBuf; // set in SetCommunicator
size_t host_heap_size; // set in SetCommunicator
#endif
protected:
Grid_MPI_Comm ShmComm; // for barriers
@ -192,10 +150,7 @@ public:
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
#ifndef ACCELERATOR_AWARE_MPI
void *HostBufferMalloc(size_t bytes);
void HostBufferFreeAll(void);
#endif
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////

View File

@ -27,8 +27,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
*************************************************************************************/
/* END LEGAL */
#define Mheader "SharedMemoryMpi: "
#include <Grid/GridCore.h>
#include <pwd.h>
@ -38,127 +36,12 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCL
#ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#endif
#ifdef GRID_SYCL
#include <sys/socket.h>
#include <sys/un.h>
#endif
NAMESPACE_BEGIN(Grid);
#ifdef SHM_SOCKETS
/*
* Barbaric extra intranode communication route in case we need sockets to pass FDs
* Forced by level_zero not being nicely designed
*/
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
sendmsg(sock, &msg, 0);
};
};
#endif
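UnixSockets above is classic SCM_RIGHTS descriptor passing over AF_UNIX datagrams, used when level_zero IPC handles cannot cross processes any other way. A usage sketch matching how the allocation code below drives it (myRank, owner, nodeSize and ipcFd are placeholders):

UnixSockets::Open(myRank);                  // bind /tmp/GridUnixSocket.<myRank>
if (myRank == owner) {
  for (int rr = 0; rr < nodeSize; rr++)     // owner fans the fd out to peers
    if (rr != owner) UnixSockets::SendFileDescriptor(ipcFd, rr);
} else {
  int fd = UnixSockets::RecvFileDescriptor();  // blocks until the owner sends;
  // the kernel duplicates the descriptor, so fd now refers to the owner's
  // buffer and can be handed to zeMemOpenIpcHandle.
}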
#define header "SharedMemoryMpi: "
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
@ -181,8 +64,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank
@ -269,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
//////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions
@ -282,11 +165,63 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
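GetShmDims factorises the per-node rank count into the processor grid greedily, one small prime at a time, so the shared-memory decomposition spreads across dimensions; for example 8 ranks per node on a 4.4.4.4 grid peels a factor 2 off dimensions 0, 1 and 2 in turn, giving ShmDims = {2,2,2,1}. A standalone reproduction of the loop (the GRID_SHM_DIMS override is omitted):

#include <vector>

static int divides(int a, int b) { return b == (b / a) * a; }

std::vector<int> ShmDimsOf(const std::vector<int> &World, int ShmSize) {
  int nd = World.size();
  std::vector<int> Shm(nd, 1);
  const int primes[3] = {2, 3, 5};
  int dim = 0, last_dim = nd - 1, Auto = 1;
  while (Auto != ShmSize) {
    int p;
    for (p = 0; p < 3; p++) {
      if (divides(primes[p], World[dim] / Shm[dim]) &&
          divides(primes[p], ShmSize / Auto)) {
        Auto *= primes[p]; Shm[dim] *= primes[p]; last_dim = dim;
        break;
      }
    }
    if (p == 3 && last_dim == dim) return {};  // 2,3,5 cannot factorise it
    dim = (dim + 1) % nd;
  }
  return Shm;  // ShmDimsOf({4,4,4,4}, 8) == {2,2,2,1}
}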
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
@ -359,8 +294,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
Coordinate HyperCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM = ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
@ -407,7 +341,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
}
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
@ -419,8 +353,6 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM=ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
@ -459,7 +391,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
@ -519,6 +451,46 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
//if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = ShmCommBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@ -541,61 +513,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef SHM_SOCKETS
UnixSockets::Open(WorldShmRank);
#endif
for(int r=0;r<WorldShmSize;r++){
MPI_Barrier(WorldShmComm);
#ifndef GRID_MPI3_SHM_NONE
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
@ -603,32 +536,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
typedef struct { int fd; pid_t pid ; } clone_mem_t;
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
#ifdef SHM_SOCKETS
for(int rr=0;rr<WorldShmSize;rr++){
if(rr!=r){
UnixSockets::SendFileDescriptor(handle.fd,rr);
}
}
#endif
}
#endif
#ifdef GRID_CUDA
@ -656,7 +581,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
MPI_Barrier(WorldShmComm);
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
@ -672,10 +596,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
int myfd;
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
@ -683,22 +603,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
if (myfd < 0) {
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
perror("pidfd_getfd failed ");
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@@ -733,18 +647,18 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
MPI_Barrier(WorldShmComm);
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#else
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -781,7 +695,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@@ -791,7 +705,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -838,7 +752,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
@@ -916,14 +830,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes);
//#else
// bcopy(src,dest,bytes);
//#endif
//}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@@ -959,16 +873,9 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
}
ShmBufferFreeAll();
#ifndef ACCELERATOR_AWARE_MPI
host_heap_size = heap_size;
HostCommBuf= GlobalSharedMemory::HostCommBuf;
HostBufferFreeAll();
#endif
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
@@ -990,7 +897,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
SharedMemoryTest();
//SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -1012,18 +919,19 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}
void *SharedMemory::ShmBuffer(int rank)


@@ -48,10 +48,9 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
optimal_comm = WorldComm;
SHM = Coordinate(processors.size(),1);
}
////////////////////////////////////////////////////////////////////////////////////////////


@@ -51,6 +51,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{


@@ -29,28 +29,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern std::vector<std::pair<int,int> > Cshift_table;
extern deviceVector<std::pair<int,int> > Cshift_table_device;
extern Vector<std::pair<int,int> > Cshift_table;
inline std::pair<int,int> *MapCshiftTable(void)
{
// GPU version
uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz);
}
acceleratorCopyToDevice((void *)&Cshift_table[0],
(void *)&Cshift_table_device[0],
sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0];
// CPU version uses the identity map
}
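The GPU branch above is an instance of a general pattern: a host-built lookup table is mirrored into device memory on demand and reallocated only when its size changes. A generic sketch, assuming the deviceVector and acceleratorCopyToDevice helpers that appear elsewhere in this diff (the function name is mine):
#include <vector>
// Sketch: lazily mirror a host table into device memory before a kernel launch.
template<class T>
T *mapTableToDevice(const std::vector<T> &host, deviceVector<T> &dev)
{
  if (dev.size() != host.size()) dev.resize(host.size()); // reallocate on resize
  acceleratorCopyToDevice((void *)&host[0], (void *)&dev[0],
                          host.size()*sizeof(T));          // refresh contents
  return &dev[0]; // device pointer usable inside accelerator_for
}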
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -89,11 +74,18 @@ Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dim
}
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
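Either branch performs the same indirect copy: entry i of the precomputed table pairs a destination offset in the packed buffer with a source offset in the lattice view. A standalone restatement with illustrative std::vector types:
#include <utility>
#include <vector>
// Table-driven gather, scalar form (SIMD coalescing omitted):
template<class T>
void gather_sketch(const std::vector<T> &lattice, std::vector<T> &buffer,
                   const std::vector<std::pair<int,int>> &table)
{
  for (size_t i = 0; i < table.size(); i++)
    buffer[table[i].first] = lattice[table[i].second]; // dest <- src
}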
@@ -118,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@@ -128,10 +121,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@@ -152,13 +156,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset);
}
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -201,11 +225,18 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
}
@@ -228,6 +259,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@@ -236,6 +268,14 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -300,12 +340,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}
{
auto table = MapCshiftTable();
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@@ -344,12 +392,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}
{
auto table = MapCshiftTable();
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
}
}


@@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
@@ -52,20 +52,17 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
RealD t1,t0;
t0=usecond();
if ( !comm_dim ) {
// std::cout << "CSHIFT: Cshift_local" <<std::endl;
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
// std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
// std::cout << "CSHIFT: Cshift_comms" <<std::endl;
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
t1=usecond();
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
@@ -94,16 +91,18 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl;
//std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
// std::cout << "Two pass Cshift_comms" <<std::endl;
//std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
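// Reference for the cbmask values threaded through the Cshift_comms* routines
// (inferred from the Even/Odd dispatch above, annotation added here):
//   0x1 : even-checkerboard sites only
//   0x2 : odd-checkerboard sites only
//   0x3 : both checkerboards in one pass, possible when sshift[0]==sshift[1]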
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
@@ -123,29 +122,21 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
@@ -153,52 +144,26 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
tscatter+=usecond();
}
}
if (Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
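// Units check for the "Cshift BW" print above: xbytes counts payload bytes one
// way and tcomms is in microseconds, so 2.0*xbytes/tcomms is already MB/s
// (1 byte/us = 1 MB/s). Worked example with hypothetical numbers:
//   xbytes = 1<<30 bytes each way, tcomms = 5.0e5 us (0.5 s)
//   BW     = 2.0*(1<<30)/5.0e5 ~= 4295 MB/s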
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -216,21 +181,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
// std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
@@ -239,20 +198,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
@@ -272,9 +227,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
tgather-=usecond();
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
@@ -299,31 +252,17 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
@@ -331,19 +270,198 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#endif
NAMESPACE_END(Grid);
#endif


@@ -1,5 +1,4 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
std::vector<std::pair<int,int> > Cshift_table;
deviceVector<std::pair<int,int> > Cshift_table_device;
Vector<std::pair<int,int> > Cshift_table;
NAMESPACE_END(Grid);

File diff suppressed because it is too large


@@ -35,7 +35,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
@@ -47,4 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/lattice/Lattice_crc.h>


@@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
// typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
@@ -345,9 +345,7 @@ GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnarySpTa, SpTa(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
@@ -458,9 +456,7 @@ GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(SpTa, UnarySpTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the


@@ -36,7 +36,6 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = lhs.Checkerboard();
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
@@ -54,7 +53,6 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@@ -72,7 +70,6 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@@ -89,7 +86,6 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("add");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@@ -110,7 +106,6 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
autoView( ret_v , ret, AcceleratorWrite);
@@ -124,7 +119,6 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -139,7 +133,6 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -153,7 +146,6 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("add");
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
autoView( ret_v , ret, AcceleratorWrite);
@@ -171,7 +163,6 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -186,7 +177,6 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -201,7 +191,6 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -215,7 +204,6 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("add");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -230,7 +218,6 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
GRID_TRACE("axpy");
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
@@ -244,7 +231,6 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
}
template<class sobj,class vobj> inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
GRID_TRACE("axpby");
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
@@ -257,68 +243,16 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
});
}
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpy_norm");
#ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpby_norm");
#ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
return axpby_norm_fast(ret,a,b,x,y);
}
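Both norm routines reduce to the fused fast path, which performs the AXPBY update and accumulates |z|^2 in the same sweep, saving a second pass over memory. A host-side sketch of the fused loop (illustrative only, double-precision accumulator; the global MPI sum is left to the caller):
#include <complex>
#include <vector>
double axpby_norm_sketch(std::vector<std::complex<double>> &z,
                         std::complex<double> a, std::complex<double> b,
                         const std::vector<std::complex<double>> &x,
                         const std::vector<std::complex<double>> &y)
{
  double nrm = 0.0;
  for (size_t i = 0; i < z.size(); i++) {
    auto t = a*x[i] + b*y[i];
    z[i]  = t;            // the AXPBY update...
    nrm  += std::norm(t); // ...and |t|^2 accumulated in the same pass
  }
  return nrm;
}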
/// Trace product
template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
-> Lattice<decltype(trace(obj()))>
{
typedef decltype(trace(obj())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( rhs2 , rhs_2, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
-> Lattice<decltype(trace(obj1()))>
{
typedef decltype(trace(obj1())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1)
-> Lattice<decltype(trace(obj1()))>
{
return traceProduct(rhs_1,rhs_2);
}
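A usage sketch for the traceProduct overloads above: the product and the trace are fused into one kernel, so tr(U V) never materialises U*V. The field names and initialisation below are hypothetical:
// LatticeColourMatrix U(grid), V(grid); // assumed initialised elsewhere
// auto tUV = traceProduct(U, V);        // per-site tr(U_x V_x), one fused kernel
// auto t   = sum(tUV);                  // volume sum, if a single number is wanted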
NAMESPACE_END(Grid);
#endif


@@ -117,7 +117,6 @@ public:
////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -141,7 +140,6 @@ public:
}
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -165,7 +163,6 @@ public:
}
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -234,23 +231,10 @@ public:
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 0
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(*vvtmp_p);
coalescedWrite(me[ss],stmp);
});
#else
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss]= r;
});
#endif
me[ss]= r;
});
me.ViewClose();
return *this;
}
@@ -304,8 +288,8 @@
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@@ -319,8 +303,8 @@
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@@ -373,7 +357,7 @@
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj;
for(int64_t g=0;g<o.Grid()->_gsites;g++){
for(int g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);


@@ -53,19 +53,36 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View;
hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
Vector<View> basis_v; basis_v.reserve(basis.size());
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
h_basis_v[k] = basis[k].View(AcceleratorWrite);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
View *basis_vp = &d_basis_v[0];
#if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
@@ -74,19 +91,17 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
deviceVector <vobj> Bt(siteBlock * nrot);
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of matrix
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
Vector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
h_Qt_jv[i]=Qt(j,k);
Qt_p[i]=Qt(j,k);
});
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -122,8 +137,9 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
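Both branches of basisRotate implement the same site-local update, buffered through B so that partially rotated vectors are never read back as inputs. In pseudo-notation:
// Per-site rotation performed by both code paths above (sketch):
//   B[j]           = sum_{k=k0}^{k1-1} Qt(j,k) * basis[k][site]   for j0 <= j < j1
//   basis[j][site] = B[j]                                         for j0 <= j < j1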
// Extract a single rotated vector
@@ -136,19 +152,16 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard();
hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){
h_basis_v[k]=basis[k].View(AcceleratorRead);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
basis_v.push_back(basis[k].View(AcceleratorRead));
}
vobj zz=Zero();
deviceVector<double> Qt_jv(Nm);
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& d_basis_v[0];
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero();
@@ -158,7 +171,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
}
coalescedWrite(result_v[ss], B);
});
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>


@@ -29,26 +29,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1)
{
auto ff = localNorm2(f);
if ( mu==-1 ) mu = f.Grid()->Nd()-1;
typedef typename vobj::tensor_reduced normtype;
typedef typename normtype::scalar_object scalar;
std::vector<scalar> sff;
sliceSum(ff,sff,mu);
for(int t=0;t<sff.size();t++){
std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
}
}
template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);
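For reference, a CRC(U) call expands to one fingerprint line built from __FILE__, __LINE__, the variable name and the crc32 of the lattice data; the output below is hypothetical:
// CRC(Umu);  // prints e.g. "FingerPrint Lattice_crc.h 51 Umu 3735928559"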


@@ -32,6 +32,7 @@ template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -81,6 +82,7 @@ template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -128,6 +130,7 @@ template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();


@@ -96,6 +96,9 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@@ -122,17 +125,14 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj>
typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
typename vobj::scalar_object s;
peekSite(s,l,site);
return s;
}
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd();
// assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -173,13 +173,13 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
idx= grid->iIndex(site);
odx= grid->oIndex(site);
const vector_type *vp = (const vector_type *) &l[odx];
scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
pt[w] = getlane(vp[w],idx);
pt[w] = vp[idx+w*Nsimd];
}
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
return;
};
template<class vobj,class sobj>
@@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd();
// assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -210,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
idx= grid->iIndex(site);
odx= grid->oIndex(site);
vector_type * vp = (vector_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
putlane(vp[w],pt[w],idx);
vp[idx+w*Nsimd] = pt[w];
}
return;
};
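The two forms in this hunk address the same SIMD layout: a vobj packs `words` vector words of Nsimd lanes each, so scalar word w of lane idx lives at flat scalar offset idx + w*Nsimd. The pointer arithmetic and the getlane/putlane calls are therefore interchangeable:
// Equivalent addressing of scalar word w in SIMD lane idx (sketch):
//   pointer form : ((scalar_type *)&v)[ idx + w*Nsimd ]
//   lane form    : getlane( ((vector_type *)&v)[w], idx )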


@@ -28,10 +28,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
#include <Grid/lattice/Lattice_slicesum_core.h>
NAMESPACE_BEGIN(Grid);
@@ -46,7 +42,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
std::vector<sobj> sumarray(nthread);
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
@@ -75,7 +71,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
const int nthread = GridThread::GetThreads();
std::vector<sobj> sumarray(nthread);
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
@@ -95,7 +91,10 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
}
/*
Threaded max, don't use for now
@@ -128,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
@@ -137,61 +136,25 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
{
Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
return sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
return sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto ssum = rankSum(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
#if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
return sum_gpu_large(&arg_v[0],osites);
auto ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
return sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
auto ssum = rankSumLarge(arg);
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum);
return ssum;
}
@@ -204,27 +167,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm);
}
template<class Op,class T1>
inline auto norm2(const LatticeUnaryExpression<Op,T1> & expr) ->RealD
{
return norm2(closure(expr));
}
template<class Op,class T1,class T2>
inline auto norm2(const LatticeBinaryExpression<Op,T1,T2> & expr) ->RealD
{
return norm2(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto norm2(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) ->RealD
{
return norm2(closure(expr));
}
//The global maximum of the site norm2
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
{
@@ -255,6 +197,7 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
ComplexD nrm;
@@ -264,8 +207,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites();
// Might make all code paths go this way.
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
deviceVector<inner_t> inner_tmp(sites);
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
@@ -273,63 +216,24 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
});
}
// This is in single precision and fails some tests
auto anrm = sumD(inner_tmp_v,sites);
auto anrm = sum(inner_tmp_v,sites);
nrm = anrm;
return nrm;
}
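Both variants of rankInnerProduct share one structure: a per-site kernel fills a temporary with inner products, a rank-local reduction collapses it, and the global MPI sum is applied by innerProduct below. A host-side sketch of the rank-local stage (illustrative only, not Grid's kernel):
#include <complex>
#include <vector>
std::complex<double> rankDot(const std::vector<std::complex<double>> &left,
                             const std::vector<std::complex<double>> &right)
{
  std::complex<double> acc = 0.0;
  for (size_t ss = 0; ss < left.size(); ss++)
    acc += std::conj(left[ss]) * right[ss]; // per-site inner product
  return acc;                               // caller still does grid->GlobalSum(acc)
}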
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// Hack
// Fast integer xor checksum. Can also be used in comms now.
autoView(l_v,left,AcceleratorRead);
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
} else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
#endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm);
ok = FlightRecorder::NormLog(real(nrm));
if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@@ -353,7 +257,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
conformable(z,x);
conformable(x,y);
// typedef typename vobj::vector_typeD vector_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x.Grid();
@@ -365,54 +270,18 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites);
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
accelerator_for( ss, sites, 1,{
auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
});
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// z_v
{
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&z_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// inner_v
{
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
}
#endif
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm));
assert(ok);
RealD local = real(nrm);
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@@ -421,8 +290,9 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
{
conformable(left,right);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
std::vector<ComplexD> tmp(2);
Vector<ComplexD> tmp(2);
GridBase *grid = left.Grid();
@@ -432,8 +302,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
// GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
deviceVector<inner_t> inner_tmp(sites);
deviceVector<norm_t> norm_tmp(sites);
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
{
@@ -483,9 +353,7 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
std::vector<typename vobj::scalar_object> &result,
int orthogdim)
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
@@ -507,8 +375,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vobj> lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
@@ -519,10 +387,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
int ostride=grid->_ostride[orthogdim];
//Reduce Data down to lvSum
sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
@@ -556,32 +433,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
}
template<class vobj> inline
std::vector<typename vobj::scalar_object>
sliceSum(const Lattice<vobj> &Data,int orthogdim)
{
std::vector<typename vobj::scalar_object> result;
sliceSum(Data,result,orthogdim);
return result;
}
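A call-site sketch for the vector-returning overload (the grid and field are assumed; orthogdim = 3 picks the time direction on a 4-d grid):
    // Hypothetical: project a field onto global timeslices.
    LatticeComplexD meson(grid);
    auto Ct = sliceSum(meson, 3); // one scalar_object per global t, same on every node
    for(size_t t=0; t<Ct.size(); t++)
      std::cout << t << " " << TensorRemove(Ct[t]) << std::endl;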
/*
Reimplement
1)
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
2)
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
3)
-- Make Slice Mul Matrix call sliceMaddMatrix
*/
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
@@ -601,8 +454,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type> lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
Vector<vector_type> lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@@ -685,8 +538,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int orthogdim,RealD scale=1.0)
{
// perhaps easier to just promote A to a field and use regular madd
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
@@ -717,7 +569,8 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
for(int l=0;l<Nsimd;l++){
grid->iCoorFromIindex(icoor,l);
int ldx =r+icoor[orthogdim]*rd;
av.putlane(scalar_type(a[ldx])*zscale,l);
scalar_type *as =(scalar_type *)&av;
as[l] = scalar_type(a[ldx])*zscale;
}
tensor_reduced at; at=av;
@@ -732,96 +585,206 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(NN-1);
Coordinate simd_phys;
std::vector<int> mpi_phys(NN-1);
Coordinate checker_dim_mask(NN-1);
int checker_dim=-1;
int dd;
std::vector<int> latt_phys(0);
std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(0);
for(int d=0;d<NN;d++){
if( d!=Orthog ) {
latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
mpi_phys[dd] =BlockSolverGrid->_processors[d];
checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
dd++;
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
}
}
simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
if(BlockSolverGrid->_isCheckerBoarded) {
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
delete tmp;
return (GridBase *) ret;
} else {
return (GridBase *) tmp;
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
GridBase *FullGrid = X.Grid();
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Ys(SliceGrid);
Lattice<vobj> Rs(SliceGrid);
Lattice<vobj> Xs(SliceGrid);
Lattice<vobj> RR(FullGrid);
RR = R; // Copies checkerboard for insert
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
for(int i=0;i<Nslice;i++){
ExtractSlice(Ys,Y,i,Orthog);
ExtractSlice(Rs,R,i,Orthog);
Rs=Ys;
for(int j=0;j<Nslice;j++){
ExtractSlice(Xs,X,j,Orthog);
Rs = Rs + Xs*(scale*aa(j,i));
}
InsertSlice(Rs,RR,i,Orthog);
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( X_v, X, CpuRead);
autoView( Y_v, Y, CpuRead);
autoView( R_v, R, CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
R=RR; // Copy back handles arguments aliasing case
delete SliceGrid;
};
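Stated compactly (notation mine), both the slice-extraction loop and the fused plane loop realise, for each hyperplane index i along Orthog,
    R_i = Y_i + scale * sum_j aa(j,i) * X_j
with i,j running over the Nblock (= global extent of Orthog) slices; sliceMulMatrix below is just the Y = 0 special case of the same operation.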
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
R=Zero();
sliceMaddMatrix(R,aa,X,R,Orthog,scale);
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( R_v, R, CpuWrite);
autoView( X_v, X, CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
Lattice<vobj> ls(SliceGrid);
Lattice<vobj> rs(SliceGrid);
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
for(int s=0;s<Nslice;s++){
ExtractSlice(ls,lhs,s,Orthog);
for(int ss=0;ss<Nslice;ss++){
ExtractSlice(rs,rhs,ss,Orthog);
mat(s,ss) = innerProduct(ls,rs);
}
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}});
thread_critical
{
mat += mat_thread;
}
}
delete SliceGrid;
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}
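A call-site sketch (the fields phi, chi and the direction index are assumptions):
    // Hypothetical: all slice-pair inner products along direction 3.
    Eigen::MatrixXcd G;
    sliceInnerProductMatrix(G, phi, chi, 3);
    // G(i,j) = sum over plane sites x of innerProduct(phi(x; slice i), chi(x; slice j)),
    // globally summed over MPI ranks.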
NAMESPACE_END(Grid);


@@ -23,27 +23,27 @@ unsigned int nextPow2(Iterator x) {
}
template <class Iterator>
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
#ifdef GRID_CUDA
cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
auto r=hipGetDevice(&device);
hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
/*
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
*/
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
@@ -53,12 +53,12 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
return 0;
exit(EXIT_FAILURE);
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
return 1;
}
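// Worked example (assumed hardware numbers): warpSize = 32, maxThreadsPerBlock
// = 1024, 48 KiB shared memory, sizeofsobj = 64 B. threads doubles
// 32->64->128->256->512 and stops since 2*512*64 = 65536 > 49152, reserving
// 512*64 B = 32 KiB of shared memory; with 80 SMs, blocks = nextPow2(80) = 128.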
template <class sobj, class Iterator>
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
@@ -207,67 +207,17 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok);
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
Integer smemSize = numThreads * sizeof(sobj);
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
deviceVector<sobj> buffer(numBlocks);
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
auto result = buffer_v[0];
return result;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
scalarD *ret_p = (scalarD *)&ret;
const int words = sizeof(vobj)/sizeof(vector);
deviceVector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
ret_p[w] = sumD_gpu_small(tbuf,osites);
}
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
sobj ret;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
if ( ok ) {
ret = sumD_gpu_small(lat,osites);
} else {
ret = sumD_gpu_large(lat,osites);
}
return ret;
}
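// Dispatch summary: the small path runs the shared-memory tree reduction
// directly when getNumBlocksAndThreads reports the sobj accumulators fit;
// otherwise sumD_gpu_large strips the vobj into sizeof(vobj)/sizeof(vector)
// single words and reduces each word with the small kernel.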
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -280,13 +230,6 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);


@@ -1,92 +0,0 @@
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj identity; zeroit(identity);
sobj ret; zeroit(ret);
Integer nsimd= vobj::Nsimd();
{
sycl::buffer<sobj, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(sycl::range<1>{osites},
Reduction,
[=] (sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
}
sobjD dret; convertType(dret,ret);
return dret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
return sumD_gpu_tensor(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
sum ^=vec[index];
});
});
}
theGridAccelerator->wait();
return ret;
}
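For reference, the call pattern used by the flight-recorder checksums earlier in this diff (the view z_v and the site count come from that context):
    // Sketch: XOR-reduce a device-visible buffer viewed as 64-bit words.
    uint64_t words = sites*sizeof(vobj)/sizeof(uint64_t);
    uint64_t *base = (uint64_t *)&z_v[0];
    uint64_t csum = svm_xor(base, words); // order-independent content fingerprint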
NAMESPACE_END(Grid);


@@ -32,8 +32,9 @@
#include <random>
#ifdef RNG_SITMO
#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
#include <Grid/random/sitmo_prng_engine.hpp>
#endif
#include <Grid/random/gaussian.h>
#if defined(RNG_SITMO)
#define RNG_FAST_DISCARD
@@ -142,8 +143,8 @@ public:
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<Grid::gaussian_distribution<RealD> > _gaussian;
// std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid;
///////////////////////
@@ -152,7 +153,6 @@ public:
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
#if 0
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// That is roughly 10^12 draws per site.
@@ -163,9 +163,9 @@ public:
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
@@ -180,9 +180,6 @@ public:
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
#else
eng.discardhi(site);
#endif
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
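// Headroom check: skip = site << 30 held in a uint64_t, so the overflow
// assert((skip>>shift)==site) tolerates site < 2^34 (~1.7e10 sites), and
// successive sites own disjoint blocks of 2^30 ~ 1e9 draws each.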
@@ -247,8 +244,8 @@ public:
GridSerialRNG() : GridRNGbase() {
_generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
// _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() );
}
@@ -361,18 +358,13 @@ public:
_generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
// _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist)
{
if ( l.Grid()->_isCheckerBoarded ) {
Lattice<vobj> tmp(_grid);
fill(tmp,dist);
pickCheckerboard(l.Checkerboard(),l,tmp);
return;
}
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
@@ -416,7 +408,7 @@ public:
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
void SeedFixedIntegers(const std::vector<int> &seeds){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@@ -433,29 +425,22 @@ public:
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
thread_for( lidx, _grid->lSites(), {
int64_t gidx;
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int o_idx;
int i_idx;
int rank;
Coordinate pcoor;
Coordinate lcoor;
Coordinate gcoor;
_grid->LocalIndexToLocalCoor(lidx,lcoor);
pcoor=_grid->ThisProcessorCoor();
_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
if ( britney ) {
Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
} else {
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
@@ -531,11 +516,11 @@ public:
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
//template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
//template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
NAMESPACE_END(Grid);
#endif


@@ -1,267 +0,0 @@
#pragma once
#if defined(GRID_CUDA)
#include <cub/cub.cuh>
#define gpucub cub
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#elif defined(GRID_HIP)
#include <hipcub/hipcub.hpp>
#define gpucub hipcub
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#endif
NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj>
inline void sliceSumReduction_cub_small(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
size_t subvol_size = e1*e2;
deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
vobj zero_init;
zeroit(zero_init);
void *temp_storage_array = NULL;
size_t temp_storage_bytes = 0;
vobj *d_out;
int* d_offsets;
std::vector<int> offsets(rd+1,0);
for (int i = 0; i < offsets.size(); i++) {
offsets[i] = i*subvol_size;
}
//Allocate memory for output and offset arrays on device
d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
//allocate memory for temp_storage_array
temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
//prepare buffer for reduction
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
//use 2d accelerator_for to avoid launch latencies found when serially looping over rd
accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
//issue segmented reductions in computeStream
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();
acceleratorFreeDevice(temp_storage_array);
acceleratorFreeDevice(d_out);
acceleratorFreeDevice(d_offsets);
}
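// Worked example (sizes assumed): rd = 4 planes with subvol_size = e1*e2 = 1024
// gives offsets = {0,1024,2048,3072,4096}; segment r of reduction_buffer holds
// plane r contiguously and DeviceSegmentedReduce emits one summed vobj per plane.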
#endif
#if defined(GRID_SYCL)
template<class vobj>
inline void sliceSumReduction_sycl_small(const vobj *Data,
std::vector <vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
size_t subvol_size = e1*e2;
vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
vobj vobj_zero;
zeroit(vobj_zero);
for (int r = 0; r<rd; r++) {
mysum[r] = vobj_zero;
}
deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
// autoView(Data_v, Data, AcceleratorRead);
//prepare reduction buffer
accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(sycl::range<1>{subvol_size},
Reduction,
[=](sycl::id<1> item, auto &sum) {
auto s = item[0];
sum += rb_p[r*subvol_size+s];
});
});
}
theGridAccelerator->wait();
for (int r = 0; r < rd; r++) {
lvSum[r] = mysum[r];
}
free(mysum,*theGridAccelerator);
}
#endif
template<class vobj>
inline void sliceSumReduction_large(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2;
deviceVector<vector>buffer(osites);
vector *dat = (vector *)Data;
vector *buf = &buffer[0];
std::vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
#elif defined(GRID_SYCL)
sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
#endif
for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r];
}
}
}
template<class vobj>
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#elif defined (GRID_SYCL)
sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
else {
sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
}
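// Note on the cutoff: sizeof(vobj) counts the full SIMD object, e.g. with
// 512-bit SIMD one vComplexD word is 64 B, so anything beyond four complex
// words (a 12-word spin-colour vector, say) takes the word-by-word large path.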
template<class vobj>
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*ostride; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
}
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
NAMESPACE_END(Grid);


@@ -66,65 +66,6 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<
return ret;
};
template<int N, class Vec>
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
typedef typename Vec::scalar_type scalar;
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<scalar, N> > > Us;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
scalar tmp= Us()()(i,j);
ComplexD ztmp(real(tmp),imag(tmp));
EigenU(i,j)=ztmp;
}}
ComplexD detD = EigenU.determinant();
typename Vec::scalar_type det(detD.real(),detD.imag());
pokeLocalSite(det,ret_v,lcoor);
});
return ret;
}
template<int N>
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
EigenU(i,j) = Us()()(i,j);
}}
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
Ui()()(i,j) = EigenUinv(i,j);
}}
pokeLocalSite(Ui,ret_v,lcoor);
});
return ret;
}
NAMESPACE_END(Grid);
#endif


@@ -194,11 +194,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
#endif
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
precisionChange(out,in);
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
}
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
precisionChange(out,in);
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
}
template<typename T1,typename T2>
@@ -276,64 +276,20 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
RealD t_IP=0;
RealD t_co=0;
RealD t_za=0;
for(int v=0;v<nbasis;v++) {
t_IP-=usecond();
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
t_IP+=usecond();
t_co-=usecond();
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
t_co+=usecond();
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
t_za-=usecond();
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
t_za+=usecond();
}
// std::cout << GridLogPerformance << " blockProject : blockInnerProduct : "<<t_IP<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : conv : "<<t_co<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
}
// This only minimises data motion from CPU to GPU
// there is chance of better implementation that does a vxk loop of inner products to data share
// at the GPU thread level
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
const Lattice<CComplex> &coarseA,
@@ -408,15 +364,8 @@ template<class vobj,class CComplex>
Lattice<dotp> coarse_inner(coarse);
// Precision promotion
RealD t;
t=-usecond();
fine_inner = localInnerProductD<vobj>(fineX,fineY);
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "<<t<<" us"<<std::endl;
t=-usecond();
blockSum(coarse_inner,fine_inner);
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : blockSum "<<t<<" us"<<std::endl;
t=-usecond();
{
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
@@ -424,7 +373,6 @@ template<class vobj,class CComplex>
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
});
}
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "<<t<<" us"<<std::endl;
}
@@ -467,9 +415,6 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
template<class vobj>
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
{
const int maxsubsec=256;
typedef iVector<vobj,maxsubsec> vSubsec;
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
@@ -489,62 +434,37 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( fineData_ , fineData, AcceleratorRead);
auto coarseData_p = &coarseData_[0];
auto fineData_p = &fineData_[0];
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
vobj zz = Zero();
// Somewhat lazy calculation
// Find the biggest power of two subsection divisor less than or equal to maxsubsec
int subsec=maxsubsec;
int subvol;
subvol=blockVol/subsec;
while(subvol*subsec!=blockVol){
subsec = subsec/2;
subvol=blockVol/subsec;
};
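// Worked example (numbers illustrative): blockVol = 48 with maxsubsec = 256
// halves subsec 256 -> 128 -> 64 -> 32 -> 16; at subsec = 16, subvol = 3 and
// 3*16 == 48, so each coarse site is accumulated in 16 subsections of 3 sites.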
Lattice<vSubsec> coarseTmp(coarse);
autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard);
auto coarseTmp_p= &coarseTmp_[0];
// Sum within subsecs in a first kernel
accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{
accelerator_for(sc,coarse->oSites(),1,{
int sc=sce/subsec;
int e=sce%subsec;
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
auto cd = coalescedRead(zz);
for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){
int sf;
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
cd=cd+coalescedRead(fineData_p[sf]);
cd=cd+fineData_p[sf];
}
coalescedWrite(coarseTmp_[sc](e),cd);
coarseData_p[sc] = cd;
});
// Sum across subsecs in a second kernel
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
auto cd = coalescedRead(coarseTmp_p[sc](0));
for(int e=1;e<subsec;e++){
cd=cd+coalescedRead(coarseTmp_p[sc](e));
}
coalescedWrite(coarseData_p[sc],cd);
});
return;
}
@@ -601,7 +521,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
blockOrthonormalize(ip,Basis);
}
#ifdef GRID_ACCELERATED
#if 0
// TODO: CPU optimized version here
template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
@@ -627,37 +547,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
autoView( fineData_ , fineData, AcceleratorWrite);
autoView( coarseData_ , coarseData, AcceleratorRead);
typedef LatticeView<vobj> Vview;
std::vector<Vview> AcceleratorVecViewContainer_h;
for(int v=0;v<nbasis;v++) {
AcceleratorVecViewContainer_h.push_back(Basis[v].View(AcceleratorRead));
}
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis);
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview));
auto Basis_p = &AcceleratorVecViewContainer[0];
// Loop with a cache friendly loop ordering
Coordinate frdimensions=fine->_rdimensions;
Coordinate crdimensions=coarse->_rdimensions;
accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{
accelerator_for(sf,fine->oSites(),1,{
int sc;
Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,frdimensions);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,crdimensions);
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
auto sum= coarseData_(sc)(0) *Basis_p[0](sf);
for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf);
coalescedWrite(fineData_[sf],sum);
for(int i=0;i<nbasis;i++) {
/* auto basis_ = Basis[i]; */
if(i==0) fineData_[sf]=coarseData_[sc](i)*basis_[sf];
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf];
}
});
for(int v=0;v<nbasis;v++) {
AcceleratorVecViewContainer_h[v].ViewClose();
}
return;
}
#else
// CPU version
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData,
@@ -681,26 +590,6 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
#endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj>
@@ -744,11 +633,7 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
static const int words=sizeof(vobj)/sizeof(vector_type);
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
@@ -764,186 +649,43 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
assert(Fg->_processors[d] == Tg->_processors[d]);
}
///////////////////////////////////////////////////////////
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
// the above should guarantee that the operations are local
Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride;
Coordinate osf = Fg->_ostride;
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
Coordinate from_coor, to_coor, base;
Lexicographic::CoorFromIndex(base,idx,RegionSize);
for(int i=0;i<nd;i++){
from_coor[i] = base[i] + FromLowerLeft[i];
to_coor[i] = base[i] + ToLowerLeft[i];
autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
int in_region=1;
for(int d=0;d<nd;d++){
if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){
in_region=0;
}
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
}
if (in_region) {
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t];
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME: with an RRII layout this type pun won't work
}
}
});
}
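// Index arithmetic used above: a local coordinate x maps to outer index
// sum_d ostride[d]*(x[d] % rdim[d]) and SIMD lane sum_d istride[d]*(x[d] / rdim[d]);
// the copy moves each of the `words` vector words between (outer,lane) pairs.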
template<class vobj>
void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
assert(nF+1 == nT);
///////////////////////////////////////////////////////////
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
Coordinate RegionSize = Fg->_ldimensions;
size_t nsite = 1;
for(int i=0;i<nF;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
Coordinate from_coor(nF), to_coor(nT);
Lexicographic::CoorFromIndex(from_coor,idx,RegionSize);
int j=0;
for(int i=0;i<nT;i++){
if ( i!=orthog ) {
to_coor[i] = from_coor[j];
j++;
} else {
to_coor[i] = slice;
}
}
int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
}
template<class vobj>
void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
assert(nT+1 == nF);
///////////////////////////////////////////////////////////
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
Coordinate RegionSize = Tg->_ldimensions;
size_t nsite = 1;
for(int i=0;i<nT;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
Coordinate from_coor(nF), to_coor(nT);
Lexicographic::CoorFromIndex(to_coor,idx,RegionSize);
int j=0;
for(int i=0;i<nF;i++){
if ( i!=orthog ) {
from_coor[i] = to_coor[j];
j++;
} else {
from_coor[i] = slice;
}
}
int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
}
template<class vobj>
void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
@@ -981,14 +723,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
hcoor[d]=lcoor[ddl++];
}
}
peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor);
@@ -1009,7 +745,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
assert(orthog<nh);
assert(orthog>=0);
assert(hg->_processors[orthog]==1);
lowDim.Checkerboard() = higherDim.Checkerboard();
int dl; dl = 0;
for(int d=0;d<nh;d++){
@@ -1027,16 +762,11 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
hcoor[orthog] = slice;
int ddl=0;
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full grid coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
hcoor[d]=lcoor[ddl++];
}
}
peekLocalSite(s,higherDimv,hcoor);
@@ -1045,7 +775,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
}
//Can I implement with local copyregion??
template<class vobj>
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
@@ -1062,22 +792,65 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
Coordinate sz = lg->_ldimensions;
sz[orthog]=1;
Coordinate f_ll(nl,0); f_ll[orthog]=slice_lo;
Coordinate t_ll(nh,0); t_ll[orthog]=slice_hi;
localCopyRegion(lowDim,higherDim,f_ll,t_ll,sz);
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor);
}
});
}
template<class vobj>
void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
InsertSliceLocal(higherDim,lowDim,slice_hi,slice_lo,orthog);
typedef typename vobj::scalar_object sobj;
GridBase *lg = lowDim.Grid();
GridBase *hg = higherDim.Grid();
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl == nh);
assert(orthog<nh);
assert(orthog>=0);
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDimv,lcoor);
}
});
}
@@ -1103,7 +876,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
Coordinate fcoor(nd);
Coordinate ccoor(nd);
for(int64_t g=0;g<fg->gSites();g++){
for(int g=0;g<fg->gSites();g++){
fg->GlobalIndexToGlobalCoor(g,fcoor);
for(int d=0;d<nd;d++){
@@ -1307,80 +1080,11 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
});
}
//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
template<class VobjOut, class VobjIn>
void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
typedef typename VobjOut::vector_type Vout;
typedef typename VobjIn::vector_type Vin;
const int N = sizeof(VobjOut)/sizeof(Vout);
conformable(out.Grid(),in.Grid());
out.Checkerboard() = in.Checkerboard();
int nsimd = out.Grid()->Nsimd();
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(idx,out.Grid()->oSites(),1,{
Vout *vout = (Vout *)&out_v[idx];
Vin *vin = (Vin *)&in_v[idx];
precisionChange(vout,vin,N);
});
}
//Convert a Lattice from one precision to another (original, slow implementation)
template<class VobjOut, class VobjIn>
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
assert(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){
assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
}
out.Checkerboard() = in.Checkerboard();
GridBase *in_grid=in.Grid();
GridBase *out_grid = out.Grid();
typedef typename VobjOut::scalar_object SobjOut;
typedef typename VobjIn::scalar_object SobjIn;
int ndim = out.Grid()->Nd();
int out_nsimd = out_grid->Nsimd();
int in_nsimd = in_grid->Nsimd();
std::vector<Coordinate > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){
out_icoor[lane].resize(ndim);
out_grid->iCoorFromIindex(out_icoor[lane], lane);
}
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
autoView( out_v , out, CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
ExtractPointerArray<SobjOut> ptrs(out_nsimd);
Coordinate lcoor(out_grid->Nd());
for(int lane=0; lane < out_nsimd; lane++){
for(int mu=0;mu<ndim;mu++)
lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
ptrs[lane] = &in_slex_conv[llex];
}
merge(out_v[out_oidx], ptrs, 0);
});
}
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
class precisionChangeWorkspace{
std::pair<Integer,Integer>* fmap_device; //device pointer
//maintain grids for checking
GridBase* _out_grid;
GridBase* _in_grid;
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
@@ -1427,46 +1131,20 @@ public:
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
void checkGrids(GridBase* out, GridBase* in) const{
conformable(out, _out_grid);
conformable(in, _in_grid);
}
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
template<class VobjOut, class VobjIn>
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
if(out.Grid() == in.Grid()){
precisionChangeFast(out,in);
return 1;
}else{
return 0;
}
}
template<class VobjOut, class VobjIn>
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
return 0;
}
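//A minimal standalone sketch (illustrative, not Grid code) of the int/long
//overload-ranking idiom above: the literal 0 in _precisionChangeFastWrap(out,in,0)
//is an int, so the SFINAE'd overload taking int is preferred whenever its
//trailing decltype is well-formed; if substitution fails, the overload taking
//long remains the only viable candidate via the int->long conversion.
template<class T>
auto example_has_fast_path(T *p, int) -> decltype( precisionChange(p, p, 1), bool() ){
  return true;  //the fast pointer-based precisionChange overload exists for T
}
template<class T>
bool example_has_fast_path(T *p, long){
  return false; //fallback: no fast precisionChange overload for T
}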
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
//which contains the mapping data.
//Convert a lattice of one precision to another. The input workspace contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
if(_precisionChangeFastWrap(out,in,0)) return;
static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
workspace.checkGrids(out.Grid(),in.Grid());
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
@@ -1483,18 +1161,15 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const pre
});
}
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
//Convert a Lattice from one precision to another
//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
if(_precisionChangeFastWrap(out,in,0)) return;
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
precisionChange(out, in, workspace);
}
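//Usage sketch (illustrative; the field and function names are assumptions, not
//library code): when many fields are converted between the same pair of grids,
//build the workspace once and amortise the map construction over all calls.
template<class FieldD, class FieldF>
void example_precision_change_loop(std::vector<FieldF> &outF,
                                   const std::vector<FieldD> &inD)
{
  assert(outF.size() == inD.size() && inD.size() > 0);
  precisionChangeWorkspace wk(outF[0].Grid(), inD[0].Grid()); //map built and copied to device once
  for(size_t k=0;k<inD.size();k++)
    precisionChange(outF[k], inD[k], wk); //no per-call workspace generation
}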
////////////////////////////////////////////////////////////////////////////////
// Communicate between grids
////////////////////////////////////////////////////////////////////////////////
@@ -1789,35 +1464,5 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
}
}
//////////////////////////////////////////////////////
// Faster but less accurate blockProject
//////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const VLattice &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
Lattice<iScalar<CComplex> > ip(coarse);
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
RealD t_IP=0;
RealD t_co=0;
for(int v=0;v<nbasis;v++) {
t_IP-=usecond();
blockInnerProductD(ip,Basis[v],fineData);
t_IP+=usecond();
t_co-=usecond();
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
t_co+=usecond();
}
}
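//Usage sketch (illustrative; names are assumptions): project a fine field onto
//the span of nbasis block-local basis vectors, e.g. as the compression step of
//a local coherence (blocked) deflation scheme. After the call, coarseData[x](v)
//holds the block inner product of Basis[v] with fineData over the coarse cell
//containing x, accumulated via blockInnerProductD.
template<class vobj, class CComplex, int nbasis, class VLattice>
void example_block_compress(Lattice<iVector<CComplex,nbasis> > &coarseData,
                            const Lattice<vobj> &fineData,
                            const VLattice &Basis)
{
  assert(Basis.size() == nbasis); //one block inner product per basis vector
  blockProjectFast(coarseData, fineData, Basis);
}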
NAMESPACE_END(Grid);

View File

@@ -45,7 +45,6 @@ public:
};
// Host only
GridBase * getGrid(void) const { return _grid; };
vobj* getHostPointer(void) const { return _odata; };
};
/////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -1,593 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/PaddedCell.h
Copyright (C) 2019
Author: Peter Boyle pboyle@bnl.gov
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include<Grid/cshift/Cshift.h>
NAMESPACE_BEGIN(Grid);
//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
template<typename vobj>
struct CshiftImplBase{
virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
virtual ~CshiftImplBase(){}
};
template<typename vobj>
struct CshiftImplDefault: public CshiftImplBase<vobj>{
Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
};
template<typename Gimpl>
struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
};
/*
*
* TODO:
* -- address elements of vobj via thread block in Scatter/Gather
* -- overlap comms with motion in Face_exchange
*
*/
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int rNsimda= Nsimd/simd[dim]; // should be equal
assert(rNsimda==rNsimd);
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
//doesn't hide the SIMD direction and keeps it explicit in the threadIdx
//for cross platform
// FIXME -- can put internal indices into thread loop
auto buf_p = & buf[0];
autoView(lat_v, lat, AcceleratorWrite);
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
#ifdef GRID_SIMT
{
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int blane=0;blane<Nsimd;blane++) {
#endif
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
///////////////////////////////////////////////////////////////
// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
///////////////////////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Transfer into lattice - will coalesce
///////////////////////////////////////////
// sobj obj = extractLane(blane,buf_p[ss+offset]);
// insertLane(lane,lat_v[osite],obj);
const int words=sizeof(vobj)/sizeof(vector_type);
vector_type * from = (vector_type *)&buf_p[ss+offset];
vector_type * to = (vector_type *)&lat_v[osite];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], blane);
putlane(to[w], stmp, lane);
}
}
});
}
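//Worked sketch (illustrative, not library code) of the lane remapping used in
//ScatterSlice/GatherSlice: a buffer lane splits into a reduced-layout lane
//olane = blane % rNsimd plus a bit obit = blane / rNsimd that is folded into
//the outer site index; the inner coordinate is decoded in the reduced layout
//(rsimd, with rsimd[dim]=1), the sliced dimension is pinned to ix, and the
//result is re-encoded in the full simd layout to recover the lattice lane.
inline int example_buffer_to_lattice_lane(int blane, int rNsimd, int ix, int dim,
                                          const Coordinate &rsimd,
                                          const Coordinate &simd)
{
  int olane = blane % rNsimd;                        //lane within reduced layout
  Coordinate icoor; int lane;
  Lexicographic::CoorFromIndex(icoor, olane, rsimd); //decode reduced inner coords
  icoor[dim] = ix;                                   //restore sliced dimension
  Lexicographic::IndexFromCoor(icoor, lane, simd);   //encode in full simd layout
  return lane;
}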
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
const Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
autoView(lat_v, lat, AcceleratorRead);
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
//doesn't hide the SIMD direction and keeps it explicit in the threadIdx
//for cross platform
//For CPU perhaps just run a loop over Nsimd
auto buf_p = & buf[0];
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
#ifdef GRID_SIMT
{
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int blane=0;blane<Nsimd;blane++) {
#endif
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
////////////////////////////////////////////
// osite
////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Take out of lattice
///////////////////////////////////////////
// sobj obj = extractLane(lane,lat_v[osite]);
// insertLane(blane,buf_p[ss+offset],obj);
const int words=sizeof(vobj)/sizeof(vector_type);
vector_type * to = (vector_type *)&buf_p[ss+offset];
vector_type * from = (vector_type *)&lat_v[osite];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], lane);
putlane(to[w], stmp, blane);
}
}
});
}
class PaddedCell {
public:
GridCartesian * unpadded_grid;
int dims;
int depth;
std::vector<GridCartesian *> grids;
~PaddedCell()
{
DeleteGrids();
}
PaddedCell(int _depth,GridCartesian *_grid)
{
unpadded_grid = _grid;
depth=_depth;
dims=_grid->Nd();
AllocateGrids();
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate procs =unpadded_grid->ProcessorGrid();
for(int d=0;d<dims;d++){
if ( procs[d] > 1 ) assert(local[d]>=depth);
}
}
void DeleteGrids(void)
{
Coordinate processors=unpadded_grid->_processors;
for(int d=0;d<grids.size();d++){
if ( processors[d] > 1 ) {
delete grids[d];
}
}
grids.resize(0);
};
void AllocateGrids(void)
{
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate simd =unpadded_grid->_simd_layout;
Coordinate processors=unpadded_grid->_processors;
Coordinate plocal =unpadded_grid->LocalDimensions();
Coordinate global(dims);
GridCartesian *old_grid = unpadded_grid;
// expand up one dim at a time
for(int d=0;d<dims;d++){
if ( processors[d] > 1 ) {
plocal[d] += 2*depth;
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
old_grid = new GridCartesian(global,simd,processors);
}
grids.push_back(old_grid);
}
};
template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
// depends on the MPI spread
Coordinate fll(dims,depth);
Coordinate tll(dims,0); // depends on the MPI spread
for(int d=0;d<dims;d++){
if( processors[d]==1 ) fll[d]=0;
}
localCopyRegion(in,out,fll,tll,local);
return out;
}
template<class vobj>
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = Expand(d,tmp,cshift); // rvalue && assignment
}
return tmp;
}
template<class vobj>
inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = ExpandPeriodic(d,tmp); // rvalue && assignment
}
return tmp;
}
// expand up one dim at a time
template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
// replace with a copy and maybe grid swizzle
// return in;??
double t = usecond();
padded = in;
tins += usecond() - t;
} else {
//////////////////////////////////////////////
// Replace sequence with
// ---------------------
// (i) Gather high face(s); start comms
// (ii) Gather low face(s); start comms
// (iii) Copy middle bit with localCopyRegion
// (iv) Complete high face(s), insert slice(s)
// (v) Complete low face(s), insert slice(s)
//////////////////////////////////////////////
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
}
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded;
}
template<class vobj>
inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
// Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
// std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
padded=in; // slightly different interface could avoid a copy operation
} else {
Face_exchange(in,padded,dim,depth);
return padded;
}
return padded;
}
template<class vobj>
void Face_exchange(const Lattice<vobj> &from,
Lattice<vobj> &to,
int dimension,int depth) const
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::scalar_object sobj;
RealD t_gather=0.0;
RealD t_scatter=0.0;
RealD t_comms=0.0;
RealD t_copy=0.0;
// std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
// DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
GridBase *grid=from.Grid();
GridBase *new_grid=to.Grid();
Coordinate lds = from.Grid()->_ldimensions;
Coordinate nlds= to.Grid()->_ldimensions;
Coordinate simd= from.Grid()->_simd_layout;
int ld = lds[dimension];
int nld = to.Grid()->_ldimensions[dimension];
const int Nsimd = vobj::Nsimd();
assert(depth<=lds[dimension]); // A must be on neighbouring node
assert(depth>0); // A caller bug if zero
assert(ld+2*depth==nld);
////////////////////////////////////////////////////////////////////////////
// Face size and byte calculations
////////////////////////////////////////////////////////////////////////////
int buffer_size = 1;
for(int d=0;d<lds.size();d++){
if ( d!= dimension) buffer_size=buffer_size*lds[d];
}
buffer_size = buffer_size / Nsimd;
int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static deviceVector<vobj> send_buf;
static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req;
int words = buffer_size;
int bytes = words * sizeof(vobj);
////////////////////////////////////////////////////////////////////////////
// Communication coords
////////////////////////////////////////////////////////////////////////////
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
////////////////////////////////////////////////////////////////////////////
// Gather all surface terms up to depth "d"
////////////////////////////////////////////////////////////////////////////
RealD t;
RealD t_tot=-usecond();
int plane=0;
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+0;
t=usecond();
GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
t_gather+=usecond()-t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+1;
t=usecond();
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
t_gather+= usecond() - t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
////////////////////////////////////////////////////////////////////////////
// Copy interior -- overlap this with comms
////////////////////////////////////////////////////////////////////////////
int Nd = new_grid->Nd();
Coordinate LL(Nd,0);
Coordinate sz = grid->_ldimensions;
Coordinate toLL(Nd,0);
toLL[dimension]=depth;
t=usecond();
localCopyRegion(from,to,LL,toLL,sz);
t_copy= usecond() - t;
////////////////////////////////////////////////////////////////////////////
// Scatter all faces
////////////////////////////////////////////////////////////////////////////
plane=0;
t=usecond();
grid->CommsComplete(fwd_req);
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
}
t_scatter= usecond() - t;
t=usecond();
grid->CommsComplete(bwd_req);
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
}
t_scatter+= usecond() - t;
t_tot+=usecond();
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes :" << depth*bytes/1e6 << "MB"<<std::endl;
}
};
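//Usage sketch (illustrative; the field U and depth are assumptions): pad a
//gauge link field by `depth` sites in every MPI-partitioned direction, operate
//on the enlarged halo, then crop back to the original grid. CshiftImplGauge
//supplies the boundary-respecting shift (e.g. for G-parity boundary
//conditions) in place of the default periodic Cshift.
template<class Gimpl>
void example_padded_roundtrip(typename Gimpl::GaugeLinkField &U, int depth)
{
  GridCartesian *grid = dynamic_cast<GridCartesian*>(U.Grid());
  assert(grid != nullptr);
  PaddedCell cell(depth, grid);
  auto Upad = cell.Exchange(U, CshiftImplGauge<Gimpl>()); //halo exchange, one dim at a time
  // ... stencil work on Upad, valid up to `depth` sites beyond the local volume ...
  U = cell.Extract(Upad);                                 //crop the interior back out
}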
NAMESPACE_END(Grid);

View File

@@ -65,40 +65,32 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(1);
GridLogError.Active(0);
GridLogWarning.Active(0);
GridLogMessage.Active(1); // at least the messages should always be on
GridLogMemory.Active(0);
GridLogTracing.Active(0);
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
}
}
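// Usage sketch (illustrative, not part of the library): optional streams are
// switched on, and default-on streams switched off, by name -- e.g. from a
// comma-separated --log command line option split into tokens.
void exampleGridLogConfigure(void)
{
  std::vector<std::string> streams = { "Performance", "Memory", "NoHMC" };
  GridLogConfigure(streams); // Performance & Memory now active, HMC reporting off
}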

View File

@@ -138,8 +138,7 @@ public:
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
// stream << log.colour() << std::left;
stream << std::left;
stream << log.colour() << std::left;
if (log.chanWidth > 0)
{
stream << std::setw(log.chanWidth);
@@ -154,9 +153,9 @@ public:
stream << log.evidence()
<< now << log.background() << " : " ;
}
// stream << log.colour();
stream << std::right;
stream << log.colour();
stream.flags(f);
return stream;
} else {
return devnull;
@@ -179,53 +178,15 @@ extern GridLogger GridLogSolver;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;
extern GridLogger GridLogIntegrator;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern GridLogger GridLogTracing;
extern Colours GridLogColours;
std::string demangle(const char* name) ;
template<typename... Args>
inline std::string sjoin(Args&&... args) noexcept {
std::ostringstream msg;
(msg << ... << args);
return msg.str();
}
/*! @brief make log messages work like python print */
template <typename... Args>
inline void Grid_log(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << GridLogMessage << msg << std::endl;
}
/*! @brief make warning messages work like python print */
template <typename... Args>
inline void Grid_warn(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl;
}
/*! @brief make error messages work like python print */
template <typename... Args>
inline void Grid_error(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl;
}
/*! @brief make pass messages work like python print */
template <typename... Args>
inline void Grid_pass(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
}
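/*! Usage sketch (illustrative, not part of the library): mixed-type arguments
    are folded into a single message via sjoin, python-print style */
inline void example_logging(int traj, double dH) {
  Grid_log ("Trajectory ", traj, " complete");        // plain message
  Grid_warn("dH = ", dH, " is suspiciously large");   // yellow warning
  Grid_pass("Plaquette consistency check passed");    // green pass message
}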
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];

View File

@@ -165,7 +165,7 @@ class BinaryIO {
* FIXME -- 128^3 x 256 x 16 will overflow.
*/
int64_t global_site;
int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@@ -175,8 +175,8 @@ class BinaryIO {
Lexicographic::IndexFromCoor(coor,global_site,global_vol);
uint64_t gsite29 = global_site%29;
uint64_t gsite31 = global_site%31;
uint32_t gsite29 = global_site%29;
uint32_t gsite31 = global_site%31;
site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
// std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
@@ -545,9 +545,7 @@ class BinaryIO {
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb,
int control=BINARYIO_LEXICOGRAPHIC
)
uint32_t &scidac_csumb)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
@@ -558,7 +556,7 @@ class BinaryIO {
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control,
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
GridStopWatch timer;
@@ -584,8 +582,7 @@ class BinaryIO {
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb,
int control=BINARYIO_LEXICOGRAPHIC)
uint32_t &scidac_csumb)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
@@ -610,7 +607,7 @@ class BinaryIO {
while (attemptsLeft >= 0)
{
grid->Barrier();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control,
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
if (checkWrite)
{
@@ -620,7 +617,7 @@ class BinaryIO {
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
grid->Barrier();
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control,
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
cknersc_csum,ckscidac_csuma,ckscidac_csumb);
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
{

View File

@@ -31,7 +31,6 @@ directory
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <map>
#include <pwd.h>
@@ -162,14 +161,8 @@ template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
{
uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
std::cout << GridLogMessage << " scidacChecksumVerify computed "<<scidac_csuma<<" expected "<<scidac_checksuma <<std::endl;
std::cout << GridLogMessage << " scidacChecksumVerify computed "<<scidac_csumb<<" expected "<<scidac_checksumb <<std::endl;
if ( scidac_csuma !=scidac_checksuma) {
return 0;
};
if ( scidac_csumb !=scidac_checksumb) {
return 0;
};
if ( scidac_csuma !=scidac_checksuma) return 0;
if ( scidac_csumb !=scidac_checksumb) return 0;
return 1;
}
@@ -212,7 +205,7 @@ class GridLimeReader : public BinaryIO {
// Read a generic lattice field and verify checksum
////////////////////////////////////////////
template<class vobj>
void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
typedef typename vobj::scalar_object sobj;
scidacChecksum scidacChecksum_;
@@ -244,7 +237,7 @@ class GridLimeReader : public BinaryIO {
uint64_t offset= ftello(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control);
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
/////////////////////////////////////////////
@@ -414,7 +407,7 @@ class GridLimeWriter : public BinaryIO
// in communicator used by the field.Grid()
////////////////////////////////////////////////////
template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
@@ -465,7 +458,7 @@ class GridLimeWriter : public BinaryIO
///////////////////////////////////////////
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control);
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
///////////////////////////////////////////
// Wind forward and close the record
@@ -518,8 +511,7 @@ class ScidacWriter : public GridLimeWriter {
////////////////////////////////////////////////
template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0,
int control=BINARYIO_LEXICOGRAPHIC)
const unsigned int recordScientificPrec = 0)
{
GridBase * grid = field.Grid();
@@ -541,7 +533,7 @@ class ScidacWriter : public GridLimeWriter {
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
}
// Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control); // Closes message with checksum
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
}
};
@@ -560,8 +552,7 @@ class ScidacReader : public GridLimeReader {
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord,
int control=BINARYIO_LEXICOGRAPHIC)
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{
typedef typename vobj::scalar_object sobj;
GridBase * grid = field.Grid();
@@ -579,7 +570,7 @@ class ScidacReader : public GridLimeReader {
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
}
void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA);
@@ -663,8 +654,7 @@ class IldgWriter : public ScidacWriter {
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
const std::string stNC = std::to_string( Nc ) ;
ildgfmt.field = std::string("su"+stNC+"gauge");
ildgfmt.field = std::string("su3gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
@@ -881,8 +871,7 @@ class IldgReader : public GridLimeReader {
} else {
assert(found_ildgFormat);
const std::string stNC = std::to_string( Nc ) ;
assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
assert ( ildgFormat_.field == std::string("su3gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@@ -890,7 +879,7 @@ class IldgReader : public GridLimeReader {
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);

View File

@@ -6,8 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -182,8 +182,8 @@ class GaugeStatistics
public:
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
header.link_trace = WilsonLoops<Impl>::linkTrace(data);
header.plaquette = WilsonLoops<Impl>::avgPlaquette(data);
header.link_trace=WilsonLoops<Impl>::linkTrace(data);
header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
}
};
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@@ -203,24 +203,20 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
assert( Nc < 4 && Nc > 1 ) ;
const int x=0;
const int y=1;
const int z=2;
for(int mu=0;mu<Nd;mu++){
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
cm(mu)()(1,1) = adj(cm(mu)()(0,x)) ;
#else
const int x=0 , y=1 , z=2 ; // a little disingenuous labelling
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#endif
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
}
}
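//Why the reconstruction above is valid: the rows of U in SU(Nc) form an
//orthonormal frame with det(U)=1, so for Nc=3 the last row is the complex
//conjugate of the cross product of the first two, and for Nc=2 the second row
//is (-b*, a*) given a first row (a, b). Storing only the first Nc-1 rows (the
//"two-row" 3x2 format below) therefore loses no information and shrinks gauge
//field I/O by one third for SU(3).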
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -282,6 +278,7 @@ struct GaugeSimpleMunger{
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
@@ -320,8 +317,8 @@
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
@@ -333,8 +330,8 @@
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}

View File

@@ -9,7 +9,6 @@
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -31,8 +30,6 @@
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
#include <string>
NAMESPACE_BEGIN(Grid);
using namespace Grid;
@@ -150,17 +147,15 @@ public:
std::string format(header.floating_point);
const int ieee32big = (format == std::string("IEEE32BIG"));
const int ieee32 = (format == std::string("IEEE32"));
const int ieee64big = (format == std::string("IEEE64BIG"));
const int ieee64 = (format == std::string("IEEE64") || \
format == std::string("IEEE64LITTLE"));
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
const std::string stNC = std::to_string( Nc ) ;
if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -171,7 +166,7 @@ public:
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -216,29 +211,27 @@ public:
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
std::string ens_label = std::string("DWF"))
{
writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
writeConfiguration(Umu,file,0,1,ens_label);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
int two_row,
int bits32,
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
std::string ens_label = std::string("DWF"))
{
typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
header.sequence_number = sequence_number;
header.ensemble_id = ens_id;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = std::string("UKQCD");
header.ensemble_label = ens_label;
header.hdr_version = "1.0" ;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
@@ -252,14 +245,10 @@ public:
uint64_t offset;
// Sod it -- always write NcxNc double
header.floating_point = std::string("IEEE64BIG");
const std::string stNC = std::to_string( Nc ) ;
if( two_row ) {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
} else {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
}
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
@@ -267,15 +256,8 @@ public:
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
if( two_row ) {
Gauge3x2unmunger<fobj2D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
}
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
@@ -307,7 +289,8 @@ public:
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
@@ -347,7 +330,7 @@ public:
GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header);
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);

View File

@@ -27,12 +27,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid);
GridTimePoint theProgramStart = GridClock::now();
NAMESPACE_BEGIN(Grid);
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)

View File

@@ -30,12 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H
#ifndef __SSC_START
#define __SSC_START
#define __SSC_STOP
#endif
#include <sys/time.h>
#include <ctime>
#include <chrono>
@@ -78,9 +72,17 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/

View File

@@ -35,8 +35,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid)
//typedef std::chrono::system_clock GridClock;
typedef std::chrono::high_resolution_clock GridClock;
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
#ifdef TIMERS_ON
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::seconds GridSecs;
@@ -44,15 +53,6 @@ typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridUsecs;
typedef std::chrono::microseconds GridTime;
extern GridTimePoint theProgramStart;
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart);
return 1.0*usecs.count();
}
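//Usage sketch (illustrative, not part of the library): either variant of
//usecond() above yields a microsecond timestamp whose differences give elapsed
//wall-clock time, and Grid code conventionally brackets a timed region with
//-/+ usecond(), as in the PaddedCell timing printouts.
inline double example_time_region(void)
{
  double t = -usecond();
  // ... work to be timed ...
  t += usecond();
  return t; //elapsed microseconds
}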
inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
{
stream << time.count()<<" s";

View File

@@ -1,70 +0,0 @@
#pragma once
NAMESPACE_BEGIN(Grid);
#ifdef GRID_TRACING_NVTX
#include <nvToolsExt.h>
class GridTracer {
public:
GridTracer(const char* name) {
nvtxRangePushA(name);
}
~GridTracer() {
nvtxRangePop();
}
};
inline void tracePush(const char *name) { nvtxRangePushA(name); }
inline void tracePop(const char *name) { nvtxRangePop(); }
inline int traceStart(const char *name) { return 0; }
inline void traceStop(int ID) { }
#endif
#ifdef GRID_TRACING_ROCTX
#include <roctracer/roctx.h>
class GridTracer {
public:
GridTracer(const char* name) {
roctxRangePushA(name);
std::cout << "roctxRangePush "<<name<<std::endl;
}
~GridTracer() {
roctxRangePop();
std::cout << "roctxRangePop "<<std::endl;
}
};
inline void tracePush(const char *name) { roctxRangePushA(name); }
inline void tracePop(const char *name) { roctxRangePop(); }
inline int traceStart(const char *name) { return roctxRangeStart(name); }
inline void traceStop(int ID) { roctxRangeStop(ID); }
#endif
#ifdef GRID_TRACING_TIMER
class GridTracer {
public:
const char *name;
double elapsed;
GridTracer(const char* _name) {
name = _name;
elapsed=-usecond();
}
~GridTracer() {
elapsed+=usecond();
std::cout << GridLogTracing << name << " took " <<elapsed<< " us" <<std::endl;
}
};
inline void tracePush(const char *name) { }
inline void tracePop(const char *name) { }
inline int traceStart(const char *name) { return 0; }
inline void traceStop(int ID) { }
#endif
#ifdef GRID_TRACING_NONE
#define GRID_TRACE(name)
inline void tracePush(const char *name) { }
inline void tracePop(const char *name) { }
inline int traceStart(const char *name) { return 0; }
inline void traceStop(int ID) { }
#else
#define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name);
#endif
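//Usage sketch (illustrative, not part of the library): GRID_TRACE opens a
//range whose lifetime is the enclosing scope, so instrumenting a region for
//whichever backend was configured (nvtx, roctx, or the plain timer) is a
//one-line declaration at the top of the block.
inline void example_traced_region(void)
{
  GRID_TRACE("example_region"); //range popped automatically at end of scope
  // ... body to be profiled ...
}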
NAMESPACE_END(Grid);
