Simplifying the MultiRHS solver to make it do SRHS *and* MRHS

relocate deflation support
Move to a blas directory
2026-05-20 17:14:30 +01:00 · 2024-03-06 14:04:33 -05:00 · 2024-02-27 11:52:23 -05:00 · 2024-02-27 11:51:04 -05:00 · 2024-02-27 11:41:44 -05:00 · 2024-02-27 11:41:13 -05:00
90 changed files with 7662 additions and 3486 deletions
@@ -59,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice.h>      
 #include <Grid/cshift/Cshift.h>       
 #include <Grid/stencil/Stencil.h>      
+#include <Grid/stencil/GeneralLocalStencil.h>      
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/algorithms/Algorithms.h>   
 NAMESPACE_CHECK(GridCore)
@@ -29,6 +29,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H

+NAMESPACE_CHECK(blas);
+#include <Grid/algorithms/blas/BatchedBlas.h>
+
 NAMESPACE_CHECK(algorithms);
 #include <Grid/algorithms/SparseMatrix.h>
 #include <Grid/algorithms/LinearOperator.h>
@@ -44,7 +47,10 @@ NAMESPACE_CHECK(SparseMatrix);
 #include <Grid/algorithms/approx/RemezGeneral.h>
 #include <Grid/algorithms/approx/ZMobius.h>
 NAMESPACE_CHECK(approx);
-#include <Grid/algorithms/iterative/Deflation.h>
+#include <Grid/algorithms/deflation/Deflation.h>
+#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
+#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
+NAMESPACE_CHECK(deflation);
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 NAMESPACE_CHECK(ConjGrad);
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
@@ -67,10 +73,10 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/iterative/PowerMethod.h>
-
+#include <Grid/algorithms/iterative/AdefGeneric.h>
 NAMESPACE_CHECK(PowerMethod);
-#include <Grid/algorithms/CoarsenedMatrix.h>
-NAMESPACE_CHECK(CoarsendMatrix);
+#include <Grid/algorithms/multigrid/MultiGrid.h>
+NAMESPACE_CHECK(multigrid);
 #include <Grid/algorithms/FFT.h>

 #endif
@@ -145,6 +145,44 @@ public:
  }
 };

+////////////////////////////////////////////////////////////////////
+// Create a shifted HermOp
+////////////////////////////////////////////////////////////////////
+template<class Field>
+class ShiftedHermOpLinearOperator : public LinearOperatorBase<Field> {
+  LinearOperatorBase<Field> &_Mat;   // wrapped operator; only its HermOp is invoked here
+  RealD _shift;                      // real coefficient multiplying the input in HermOp
+public:
+  ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift)
+    : _Mat(Mat), _shift(shift)
+  {}
+
+  /////////////////////////////////////////////////////////
+  // The two entry points that are actually implemented
+  /////////////////////////////////////////////////////////
+  // out = _Mat.HermOp(in) + _shift * in
+  void HermOp(const Field &in, Field &out){
+    _Mat.HermOp(in,out);
+    out = out + _shift*in;
+  }
+  // Applies HermOp, then returns n1 = real part of innerProduct(in,out)
+  // and n2 = norm2(out).
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    HermOp(in,out);
+    ComplexD overlap = innerProduct(in,out);
+    n1 = real(overlap);
+    n2 = norm2(out);
+  }
+
+  /////////////////////////////////////////////////////////
+  // Remaining interface: not provided for the shifted
+  // operator; each call trips assert(0)
+  /////////////////////////////////////////////////////////
+  void Op     (const Field &in, Field &out){
+    assert(0);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    assert(0);
+  }
+  void OpDiag (const Field &in, Field &out) {
+    assert(0);
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    assert(0);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  }
+};
+
+
+
+
 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
 ////////////////////////////////////////////////////////////////////
@@ -460,53 +498,6 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi
  }
 };

-template<class Matrix,class Field>
-class QuadLinearOperator : public LinearOperatorBase<Field> {
-  Matrix &_Mat;
-public:
-  RealD a0,a1,a2;
-  QuadLinearOperator(Matrix &Mat): _Mat(Mat),a0(0.),a1(0.),a2(1.) {};
-  QuadLinearOperator(Matrix &Mat, RealD _a0,RealD _a1,RealD _a2): _Mat(Mat),a0(_a0),a1(_a1),a2(_a2) {};
-  // Support for coarsening to a multigrid
-  void OpDiag (const Field &in, Field &out) {
-    assert(0);
-    _Mat.Mdiag(in,out);
-  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
-    _Mat.Mdir(in,out,dir,disp);
-  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
-    _Mat.MdirAll(in,out);
-  }
-  void HermOp (const Field &in, Field &out){
-//    _Mat.M(in,out);
-    Field tmp1(in.Grid());
-//    Linop.HermOpAndNorm(psi, mmp, d, b);
-    _Mat.M(in,tmp1);
-    _Mat.M(tmp1,out);
-    out *= a2;
-    axpy(out, a1, tmp1, out);
-    axpy(out, a0, in, out);
-//    d=real(innerProduct(psi,mmp));
-//    b=norm2(mmp);
-  }
-  void AdjOp     (const Field &in, Field &out){
-    assert(0);
-    _Mat.M(in,out);
-  }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    HermOp(in,out);
-    ComplexD dot= innerProduct(in,out); n1=real(dot);
-    n2=norm2(out);
-  }
-  void Op(const Field &in, Field &out){
-    assert(0);
-    _Mat.M(in,out);
-  }
-};
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
 // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
@@ -90,9 +90,8 @@ public:
    order=_order;
      
    if(order < 2) exit(-1);
-    Coeffs.resize(order);
-    Coeffs.assign(0.,order);
-    Coeffs[order-1] = 1.;
+    Coeffs.resize(order,0.0);
+    Coeffs[order-1] = 1.0;
  };
  
  // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
@@ -36,12 +36,11 @@ NAMESPACE_BEGIN(Grid);
 // Abstract base class.
 // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
 // and returns a forecasted solution to the system D*psi = phi (psi).
-// Changing to operator
-template<class LinearOperatorBase, class Field>
+// Matrix is whatever operator type the concrete forecaster calls into;
+// Field is the vector type of the source, the stored solutions and the result.
+template<class Matrix, class Field>
 class Forecast
 {
 public:
-  virtual Field operator()(LinearOperatorBase &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
+  // Polymorphic base: a virtual destructor makes destruction through a
+  // Forecast pointer or reference well defined.
+  virtual ~Forecast() = default;
+  // Return a forecast for the solution given the operator Mat, the source phi
+  // and the vector of fields chi.
+  virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
 };

 // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
@@ -55,13 +54,13 @@ public:
  Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
  {
    int degree = prev_solns.size();
-    std::cout << GridLogMessage << "ChronoForecast: degree= " << degree << std::endl;
    Field chi(phi); // forecasted solution

    // Trivial cases
    if(degree == 0){ chi = Zero(); return chi; }
    else if(degree == 1){ return prev_solns[0]; }

+    //    RealD dot;
    ComplexD xp;
    Field r(phi); // residual
    Field Mv(phi);
@@ -84,9 +83,8 @@ public:
    // Perform sparse matrix multiplication and construct rhs
    for(int i=0; i<degree; i++){
      b[i] = innerProduct(v[i],phi);
-//      Mat.M(v[i],Mv);
-//      Mat.Mdag(Mv,MdagMv[i]);
-      Mat.HermOp(v[i],MdagMv[i]);
+      Mat.M(v[i],Mv);
+      Mat.Mdag(Mv,MdagMv[i]);
      G[i][i] = innerProduct(v[i],MdagMv[i]);
    }

@@ -40,7 +40,7 @@ public:
  RealD norm;
  RealD lo,hi;

-  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
+  // Constructs poles, residues and tolerances from n (presumably n-entry containers;
+  // their declarations are outside this hunk) and stores the bounds in lo / hi.
+  // The tolerances(n) initialiser is the only difference from the removed line above.
+  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;};
  RealD approx(RealD x);
  void csv(std::ostream &out);
  void gnuplot(std::ostream &out);
@@ -0,0 +1,685 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: BatchedBlas.h
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+// Vendor BLAS header for the selected accelerator backend.
+#ifdef GRID_HIP
+#include <hipblas/hipblas.h>
+#endif
+#ifdef GRID_CUDA
+// The CUDA code paths below use cublas* symbols, which come from cuBLAS, not hipBLAS.
+#include <cublas_v2.h>
+#endif
+#ifdef GRID_SYCL
+#error "BatchedBlas.h: GRID_SYCL needs a oneMKL implementation, which is not written yet"
+#endif
+
+///////////////////////////////////////////////////////////////////////	  
+// Need to rearrange lattice data to be in the right format for a
+// batched multiply. Might as well make these static, dense packed
+///////////////////////////////////////////////////////////////////////
+NAMESPACE_BEGIN(Grid);
+// Backend-neutral alias for the vendor BLAS handle type.
+#ifdef GRID_HIP
+  typedef hipblasHandle_t gridblasHandle_t;
+#endif
+#ifdef GRID_CUDA
+  // cuBLAS names its handle cublasHandle_t; this is the type cublasCreate() fills in.
+  typedef cublasHandle_t gridblasHandle_t;
+#endif
+#ifdef GRID_SYCL
+  typedef int32_t gridblasHandle_t;
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+  // Host-only build: no vendor handle exists, so use a placeholder integer.
+  typedef int32_t gridblasHandle_t;
+#endif
+
+// Operand transformation selector: N = as stored, T = transpose, C = conjugate transpose.
+enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
+
+class GridBLAS {
+public:
+
+  
+  static gridblasHandle_t gridblasHandle;
+  static int            gridblasInit;
+  
+  // Create the vendor BLAS handle the first time this is called; later calls
+  // see gridblasInit set and do nothing. Handle creation failure is asserted
+  // on, matching how the GEMM calls in this class treat their status codes.
+  static void Init(void)
+  {
+    if ( ! gridblasInit ) {
+#ifdef GRID_CUDA
+      std::cout << "cublasCreate"<<std::endl;
+      auto cuStatus = cublasCreate(&gridblasHandle);
+      assert(cuStatus==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_HIP
+      std::cout << "hipblasCreate"<<std::endl;
+      auto hipStatus = hipblasCreate(&gridblasHandle);
+      assert(hipStatus==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+#endif
+      gridblasInit=1;
+    }
+  }
+  
+  // Constructing any GridBLAS object runs Init(), which creates the shared
+  // handle only on its first call.
+  GridBLAS()
+  {
+    Init();
+  }
+  // Nothing is released here.
+  ~GridBLAS() {}
+  
+  /////////////////////////////////////////////////////////////////////////////////////
+  // BLAS GEMM conventions:
+  /////////////////////////////////////////////////////////////////////////////////////
+  // - C = alpha A * B + beta C
+  // Dimensions:
+  // - C_m.n
+  // - A_m.k
+  // - B_k.n
+  // - Flops = 8 M N K
+  // - Bytes = 2*sizeof(word) * (MN+MK+KN)
+  // Example: M=K=60, N=12
+  // Flop/Byte = 8 . 60.60.12 / ( (60.12+60.60+60.12) . 16 ) ~ 4 so expect about 4 TF/s on a GCD
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Wait for the device: HIP and CUDA builds call the runtime's device
+  // synchronise and assert on its status; SYCL builds call accelerator_barrier().
+  // With none of the three backends defined the body is empty.
+  void synchronise(void)
+  {
+#ifdef GRID_HIP
+    auto hipStatus = hipDeviceSynchronize();
+    assert(hipStatus==hipSuccess);
+#endif
+#ifdef GRID_CUDA
+    auto cudaStatus = cudaDeviceSynchronize();
+    assert(cudaStatus==cudaSuccess);
+#endif
+#ifdef GRID_SYCL
+    accelerator_barrier();
+#endif
+  }
+  
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Convenience overloads, one per scalar type: forward to the general
+  // gemmBatched with neither operand transposed (OpA = OpB = GridBLAS_OP_N).
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Double precision complex
+  void gemmBatched(int m,int n, int k,
+                   ComplexD alpha,
+                   deviceVector<ComplexD*> &Amk,  // one pointer per batch entry
+                   deviceVector<ComplexD*> &Bkn,
+                   ComplexD beta,
+                   deviceVector<ComplexD*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,m,n,k,alpha,Amk,Bkn,beta,Cmn);
+  }
+  // Single precision complex
+  void gemmBatched(int m,int n, int k,
+                   ComplexF alpha,
+                   deviceVector<ComplexF*> &Amk,  // one pointer per batch entry
+                   deviceVector<ComplexF*> &Bkn,
+                   ComplexF beta,
+                   deviceVector<ComplexF*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,m,n,k,alpha,Amk,Bkn,beta,Cmn);
+  }
+  // Double precision real
+  void gemmBatched(int m,int n, int k,
+                   RealD alpha,
+                   deviceVector<RealD*> &Amk,  // one pointer per batch entry
+                   deviceVector<RealD*> &Bkn,
+                   RealD beta,
+                   deviceVector<RealD*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,m,n,k,alpha,Amk,Bkn,beta,Cmn);
+  }
+  // Single precision real
+  void gemmBatched(int m,int n, int k,
+                   RealF alpha,
+                   deviceVector<RealF*> &Amk,  // one pointer per batch entry
+                   deviceVector<RealF*> &Bkn,
+                   RealF beta,
+                   deviceVector<RealF*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,m,n,k,alpha,Amk,Bkn,beta,Cmn);
+  }
+
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Double precision complex batched GEMM.
+  // For every batch entry p:  C[p] = alpha * op(A[p]) * op(B[p]) + beta * C[p]
+  // with column-major storage, op(A) m x k, op(B) k x n and C m x n.
+  // Amk, Bkn and Cmn each hold one matrix pointer per batch entry and must
+  // all have the same length (asserted).
+  ///////////////////////////////////////////////////////////////////////////
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   ComplexD alpha,
+                   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexD*> &Bkn,
+                   ComplexD beta,
+                   deviceVector<ComplexD*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+
+    // Leading dimension is the row count of the matrix *as stored*, so a
+    // transposed operand is stored k x m (A) or n x k (B).
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+    
+    static deviceVector<ComplexD> alpha_p(1);
+    static deviceVector<ComplexD> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+    RealD t0=usecond();
+    //    std::cout << "ZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
+#ifdef GRID_HIP
+    // Default to OP_N so the vendor enum is never read uninitialised.
+    hipblasOperation_t hOpA = HIPBLAS_OP_N;
+    hipblasOperation_t hOpB = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasZgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (hipblasDoubleComplex *) &alpha_p[0],
+                                   (hipblasDoubleComplex **)&Amk[0], lda,
+                                   (hipblasDoubleComplex **)&Bkn[0], ldb,
+                                   (hipblasDoubleComplex *) &beta_p[0],
+                                   (hipblasDoubleComplex **)&Cmn[0], ldc,
+                                   batchCount);
+    //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    // Default to OP_N so the vendor enum is never read uninitialised.
+    cublasOperation_t hOpA = CUBLAS_OP_N;
+    cublasOperation_t hOpB = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasZgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (cuDoubleComplex *) &alpha_p[0],
+                                  (cuDoubleComplex **)&Amk[0], lda,
+                                  (cuDoubleComplex **)&Bkn[0], ldb,
+                                  (cuDoubleComplex *) &beta_p[0],
+                                  (cuDoubleComplex **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    // MKL's cblas_<T>gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Host reference implementation. Each batch entry is reached through its
+    // own pointer, and alpha/beta are used directly as host values.
+    for (int p = 0; p < batchCount; ++p) {
+      const ComplexD *A = Amk[p];
+      const ComplexD *B = Bkn[p];
+      ComplexD       *C = Cmn[p];
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          ComplexD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk) {
+            // op(A)(mm,kk) and op(B)(kk,nn), honouring transpose / conjugate transpose
+            ComplexD a = (OpA==GridBLAS_OP_N) ? A[mm + kk*lda] : A[kk + mm*lda];
+            ComplexD b = (OpB==GridBLAS_OP_N) ? B[kk + nn*ldb] : B[nn + kk*ldb];
+            if ( OpA == GridBLAS_OP_C ) a = ComplexD(a.real(),-a.imag());
+            if ( OpB == GridBLAS_OP_C ) b = ComplexD(b.real(),-b.imag());
+            c_mn += a*b;
+          }
+          // BLAS convention: with beta == 0 the old C is not read, so
+          // uninitialised output storage cannot inject NaNs.
+          if ( beta == ComplexD(0.0) ) C[mm + nn*ldc] = alpha*c_mn;
+          else                         C[mm + nn*ldc] = alpha*c_mn + beta*C[mm + nn*ldc];
+        }
+      }
+    }
+#endif
+    //    synchronise();
+     RealD t1=usecond();
+     RealD flops = 8.0*m*n*k*batchCount;
+     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
+     //     std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Single precision complex batched GEMM.
+  // For every batch entry p:  C[p] = alpha * op(A[p]) * op(B[p]) + beta * C[p]
+  // with column-major storage, op(A) m x k, op(B) k x n and C m x n.
+  // Amk, Bkn and Cmn each hold one matrix pointer per batch entry and must
+  // all have the same length (asserted).
+  ///////////////////////////////////////////////////////////////////////////
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   ComplexF alpha,
+                   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexF*> &Bkn,
+                   ComplexF beta,
+                   deviceVector<ComplexF*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    // Leading dimension is the row count of the matrix *as stored*.
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+    static deviceVector<ComplexF> alpha_p(1);
+    static deviceVector<ComplexF> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    // Default to OP_N so the vendor enum is never read uninitialised.
+    hipblasOperation_t hOpA = HIPBLAS_OP_N;
+    hipblasOperation_t hOpB = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasCgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (hipblasComplex *) &alpha_p[0],
+                                   (hipblasComplex **)&Amk[0], lda,
+                                   (hipblasComplex **)&Bkn[0], ldb,
+                                   (hipblasComplex *) &beta_p[0],
+                                   (hipblasComplex **)&Cmn[0], ldc,
+                                   batchCount);
+
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    // Default to OP_N so the vendor enum is never read uninitialised.
+    cublasOperation_t hOpA = CUBLAS_OP_N;
+    cublasOperation_t hOpB = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasCgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (cuComplex *) &alpha_p[0],
+                                  (cuComplex **)&Amk[0], lda,
+                                  (cuComplex **)&Bkn[0], ldb,
+                                  (cuComplex *) &beta_p[0],
+                                  (cuComplex **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    // MKL's cblas_<T>gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Host reference implementation. Each batch entry is reached through its
+    // own pointer, and alpha/beta are used directly as host values. The
+    // accumulator has the same precision as the operands so every product
+    // and sum is between like complex types.
+    for (int p = 0; p < batchCount; ++p) {
+      const ComplexF *A = Amk[p];
+      const ComplexF *B = Bkn[p];
+      ComplexF       *C = Cmn[p];
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          ComplexF c_mn(0.0f);
+          for (int kk = 0; kk < k; ++kk) {
+            // op(A)(mm,kk) and op(B)(kk,nn), honouring transpose / conjugate transpose
+            ComplexF a = (OpA==GridBLAS_OP_N) ? A[mm + kk*lda] : A[kk + mm*lda];
+            ComplexF b = (OpB==GridBLAS_OP_N) ? B[kk + nn*ldb] : B[nn + kk*ldb];
+            if ( OpA == GridBLAS_OP_C ) a = ComplexF(a.real(),-a.imag());
+            if ( OpB == GridBLAS_OP_C ) b = ComplexF(b.real(),-b.imag());
+            c_mn += a*b;
+          }
+          // BLAS convention: with beta == 0 the old C is not read.
+          if ( beta == ComplexF(0.0f) ) C[mm + nn*ldc] = alpha*c_mn;
+          else                          C[mm + nn*ldc] = alpha*c_mn + beta*C[mm + nn*ldc];
+        }
+      }
+    }
+#endif
+     RealD t1=usecond();
+     RealD flops = 8.0*m*n*k*batchCount;
+     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
+  }
+  
+  ///////////////////////////////////////////////////////////////////////////
+  // Single precision real GEMM
+  ///////////////////////////////////////////////////////////////////////////
+
+  // For every batch entry p:  C[p] = alpha * op(A[p]) * op(B[p]) + beta * C[p]
+  // with column-major storage, op(A) m x k, op(B) k x n and C m x n.
+  // Amk, Bkn and Cmn each hold one matrix pointer per batch entry and must
+  // all have the same length (asserted).
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   RealF alpha,
+                   deviceVector<RealF*> &Amk,  // pointer list to matrices
+                   deviceVector<RealF*> &Bkn,
+                   RealF beta,
+                   deviceVector<RealF*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    // Leading dimension is the row count of the matrix *as stored*.
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+    static deviceVector<RealF> alpha_p(1);
+    static deviceVector<RealF> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    // Default to OP_N so the vendor enum is never read uninitialised.
+    hipblasOperation_t hOpA = HIPBLAS_OP_N;
+    hipblasOperation_t hOpB = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasSgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (float *) &alpha_p[0],
+                                   (float **)&Amk[0], lda,
+                                   (float **)&Bkn[0], ldb,
+                                   (float *) &beta_p[0],
+                                   (float **)&Cmn[0], ldc,
+                                   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    // Default to OP_N so the vendor enum is never read uninitialised.
+    cublasOperation_t hOpA = CUBLAS_OP_N;
+    cublasOperation_t hOpB = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasSgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (float *) &alpha_p[0],
+                                  (float **)&Amk[0], lda,
+                                  (float **)&Bkn[0], ldb,
+                                  (float *) &beta_p[0],
+                                  (float **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    // MKL's cblas_<T>gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Host reference implementation. Each batch entry is reached through its
+    // own pointer, and alpha/beta are used directly as host values. For real
+    // data the conjugate transpose is the same as the transpose.
+    for (int p = 0; p < batchCount; ++p) {
+      const RealF *A = Amk[p];
+      const RealF *B = Bkn[p];
+      RealF       *C = Cmn[p];
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          RealD c_mn(0.0); // sum accumulated in double, as the original intended
+          for (int kk = 0; kk < k; ++kk) {
+            RealF a = (OpA==GridBLAS_OP_N) ? A[mm + kk*lda] : A[kk + mm*lda];
+            RealF b = (OpB==GridBLAS_OP_N) ? B[kk + nn*ldb] : B[nn + kk*ldb];
+            c_mn += a*b;
+          }
+          // BLAS convention: with beta == 0 the old C is not read.
+          if ( beta == RealF(0) ) C[mm + nn*ldc] = alpha*c_mn;
+          else                    C[mm + nn*ldc] = alpha*c_mn + beta*C[mm + nn*ldc];
+        }
+      }
+    }
+#endif
+     RealD t1=usecond();
+     RealD flops = 2.0*m*n*k*batchCount;
+     RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
+  }
+  
+  
+  ///////////////////////////////////////////////////////////////////////////
+  // Double precision real GEMM
+  ///////////////////////////////////////////////////////////////////////////
+
+  // For every batch entry p:  C[p] = alpha * op(A[p]) * op(B[p]) + beta * C[p]
+  // with column-major storage, op(A) m x k, op(B) k x n and C m x n.
+  // Amk, Bkn and Cmn each hold one matrix pointer per batch entry and must
+  // all have the same length (asserted).
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   RealD alpha,
+                   deviceVector<RealD*> &Amk,  // pointer list to matrices
+                   deviceVector<RealD*> &Bkn,
+                   RealD beta,
+                   deviceVector<RealD*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    // Leading dimension is the row count of the matrix *as stored*.
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+    
+    static deviceVector<RealD> alpha_p(1);
+    static deviceVector<RealD> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    // Default to OP_N so the vendor enum is never read uninitialised.
+    hipblasOperation_t hOpA = HIPBLAS_OP_N;
+    hipblasOperation_t hOpB = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    // Pass the requested operations through: lda/ldb above were already set
+    // for them, so hard-coding OP_N here would read the operands with the
+    // wrong layout.
+    auto err = hipblasDgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (double *) &alpha_p[0],
+                                   (double **)&Amk[0], lda,
+                                   (double **)&Bkn[0], ldb,
+                                   (double *) &beta_p[0],
+                                   (double **)&Cmn[0], ldc,
+                                   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    // Default to OP_N so the vendor enum is never read uninitialised.
+    cublasOperation_t hOpA = CUBLAS_OP_N;
+    cublasOperation_t hOpB = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasDgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (double *) &alpha_p[0],
+                                  (double **)&Amk[0], lda,
+                                  (double **)&Bkn[0], ldb,
+                                  (double *) &beta_p[0],
+                                  (double **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    /*
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
+      onemkl::transpose::N,
+      onemkl::transpose::N,
+      &m64,&n64,&k64,
+      (double *) &alpha_p[0],
+      (double **)&Amk[0], lda,
+      (double **)&Bkn[0], ldb,
+      (double *) &beta_p[0],
+      (double **)&Cmn[0], ldc,
+      1,&batchCount64);
+     */
+    // MKL's cblas_<T>gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Host reference implementation. Each batch entry is reached through its
+    // own pointer, and alpha/beta are used directly as host values. For real
+    // data the conjugate transpose is the same as the transpose.
+    for (int p = 0; p < batchCount; ++p) {
+      const RealD *A = Amk[p];
+      const RealD *B = Bkn[p];
+      RealD       *C = Cmn[p];
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          RealD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk) {
+            RealD a = (OpA==GridBLAS_OP_N) ? A[mm + kk*lda] : A[kk + mm*lda];
+            RealD b = (OpB==GridBLAS_OP_N) ? B[kk + nn*ldb] : B[nn + kk*ldb];
+            c_mn += a*b;
+          }
+          // BLAS convention: with beta == 0 the old C is not read.
+          if ( beta == RealD(0) ) C[mm + nn*ldc] = alpha*c_mn;
+          else                    C[mm + nn*ldc] = alpha*c_mn + beta*C[mm + nn*ldc];
+        }
+      }
+    }
+#endif
+     RealD t1=usecond();
+     RealD flops = 2.0*m*n*k*batchCount;
+     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
+  }
+  
+
+  
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // Strided case used by benchmark, but generally unused in Grid
+  // Keep a code example in double complex, but don't generate the single and real variants for now
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  
+  // Strided batched double complex GEMM without transposition:
+  //   for p in [0,batchCount):  C_p = alpha * A_p * B_p + beta * C_p
+  // Amk, Bkn and Cmn are base pointers to densely packed column-major
+  // matrices; consecutive batch entries are m*k, k*n and m*n elements apart.
+  void gemmStridedBatched(int m,int n, int k,
+                          ComplexD alpha,
+                          ComplexD* Amk,  // base of batchCount packed m x k matrices
+                          ComplexD* Bkn,  // base of batchCount packed k x n matrices
+                          ComplexD beta,
+                          ComplexD* Cmn,  // base of batchCount packed m x n matrices
+                          int batchCount)
+  {
+    // Column-major storage, no transposes requested
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    int sda = m*k;
+    int sdb = k*n;
+    int sdc = m*n;
+    deviceVector<ComplexD> alpha_p(1);
+    deviceVector<ComplexD> beta_p(1);
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
+    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
+    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
+#ifdef GRID_HIP
+    auto err = hipblasZgemmStridedBatched(gridblasHandle,
+                                          HIPBLAS_OP_N,
+                                          HIPBLAS_OP_N,
+                                          m,n,k,
+                                          (hipblasDoubleComplex *) &alpha_p[0],
+                                          (hipblasDoubleComplex *) Amk, lda, sda,
+                                          (hipblasDoubleComplex *) Bkn, ldb, sdb,
+                                          (hipblasDoubleComplex *) &beta_p[0],
+                                          (hipblasDoubleComplex *) Cmn, ldc, sdc,
+                                          batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    // Check the status as the HIP branch and the pointer-list GEMMs do.
+    auto err = cublasZgemmStridedBatched(gridblasHandle,
+                                         CUBLAS_OP_N,
+                                         CUBLAS_OP_N,
+                                         m,n,k,
+                                         (cuDoubleComplex *) &alpha_p[0],
+                                         (cuDoubleComplex *) Amk, lda, sda,
+                                         (cuDoubleComplex *) Bkn, ldb, sdb,
+                                         (cuDoubleComplex *) &beta_p[0],
+                                         (cuDoubleComplex *) Cmn, ldc, sdc,
+                                         batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+     #warning "oneMKL implementation not made "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+     // Host reference implementation; alpha/beta are used directly as host values.
+     for (int p = 0; p < batchCount; ++p) {
+       for (int mm = 0; mm < m; ++mm) {
+         for (int nn = 0; nn < n; ++nn) {
+           ComplexD c_mn(0.0);
+           for (int kk = 0; kk < k; ++kk)
+             c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+           Cmn[mm + nn*ldc + p*sdc] =  alpha*c_mn + beta*Cmn[mm + nn*ldc + p*sdc];
+         }
+       }
+     }
+#endif
+  }
+
+  // Times gemmStridedBatched on zero-filled buffers: 10 repetitions, each
+  // issuing nstencil calls of coarseVol (nbasis x nbasis) * (nbasis x nrhs)
+  // products followed by a synchronise(), then prints the two rate lines.
+  void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
+  {
+    // Element counts of the three operand buffers
+    int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
+    int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
+    int32_t N_C = nbasis*nrhs*coarseVol*nstencil; 
+
+    deviceVector<ComplexD> A(N_A);
+    deviceVector<ComplexD> B(N_B);
+    deviceVector<ComplexD> C(N_C);
+    acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
+    acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
+    acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
+
+    ComplexD alpha(1.0);
+    ComplexD beta (1.0);
+
+    for(int rep=0;rep<10;rep++){
+      RealD start = usecond();
+      for(int leg=0;leg<nstencil;leg++){
+        gemmStridedBatched(nbasis,nrhs,nbasis,
+                           alpha,
+                           &A[0], // m x k 
+                           &B[0], // k x n
+                           beta, 
+                           &C[0], // m x n
+                           coarseVol);
+      }
+      synchronise();
+      RealD stop = usecond();
+      RealD elapsed = stop-start; // microseconds
+      RealD flops = 8.0*nbasis*nbasis*nrhs*coarseVol*nstencil;
+      RealD bytes = 1.0*sizeof(ComplexD)*(nbasis*nbasis+nbasis*nrhs*3)*coarseVol*nstencil;
+      std::cout << " batched Blas call "<<rep<<" "<< flops/elapsed/1.e3 <<" GF/s "<<elapsed/1.e3<<" ms "<<std::endl;
+      std::cout << " batched Blas call "<<rep<<" "<< bytes/elapsed/1.e3 <<" GB/s "<<elapsed/1.e3<<" ms "<<std::endl;
+    }
+  }
+
+
+
+
+};
+
+NAMESPACE_END(Grid);
@@ -0,0 +1,512 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: MultiRHSBlockProject.h
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+
+/* 
+   MultiRHS block projection
+
+   Import basis -> nblock x nbasis x  (block x internal) 
+   Import vector of fine lattice objects -> nblock x nrhs x (block x internal) 
+
+   => coarse_(nrhs x nbasis )^block = via batched GEMM
+
+//template<class vobj,class CComplex,int nbasis,class VLattice>
+//inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
+//			   const VLattice &fineData,
+//			   const VLattice &Basis)
+*/
+
+/*
+ * BLAS accelerated block projection / promotion between a fine grid and a
+ * coarse grid for several right hand sides at once.
+ *
+ * Usage, as implied by the methods below:
+ *   Allocate(nbasis,fine,coarse)  -- records geometry and sizes BLAS_V
+ *   ImportBasis(basis)            -- packs the nbasis fine vectors into BLAS_V
+ *   blockProject(fine,coarse)     -- coarse = V^dag fine, one GEMM per coarse site
+ *   blockPromote(fine,coarse)     -- fine   = V coarse,   one GEMM per coarse site
+ *
+ * fine.size() (the number of right hand sides) must equal coarse.size(), and
+ * the coarse scalar object must hold exactly nbasis scalars.
+ */
+template<class Field>
+class MultiRHSBlockProject
+{
+public:
+
+  typedef typename Field::scalar_type   scalar;
+  typedef typename Field::scalar_object scalar_object;
+  typedef Field Fermion;
+
+  // Geometry: set by Allocate(), cleared by Deallocate()
+  int nbasis;             // basis vectors per block
+  GridBase *coarse_grid;
+  GridBase *fine_grid;
+  uint64_t block_vol;     // fine_vol / coarse_vol
+  uint64_t fine_vol;      // fine_grid->lSites()
+  uint64_t coarse_vol;    // coarse_grid->lSites()
+  uint64_t words;         // scalars per fine scalar_object
+
+  // Row major layout "C" order:
+  // BLAS_V[coarse_vol][nbasis][block_vol][words]
+  // BLAS_F[coarse_vol][nrhs][block_vol][words]
+  // BLAS_C[coarse_vol][nrhs][nbasis]
+  /*
+   * in Fortran column major notation (cuBlas order)
+   *
+   * Vxb = [v1(x)][..][vn(x)] ... x coarse vol
+   *
+   * Fxr = [r1(x)][..][rm(x)] ... x coarse vol
+   *
+   * Block project:
+   * C_br = V^dag F x coarse vol
+   *
+   * Block promote:
+   * F_xr = Vxb Cbr x coarse_vol
+   */  
+  deviceVector<scalar> BLAS_V;      // words * block_vol * nbasis x coarse_vol 
+  deviceVector<scalar> BLAS_F;      // nrhs x fine_vol * words   -- the sources
+  deviceVector<scalar> BLAS_C;      // nrhs x coarse_vol * nbasis -- the coarse coeffs
+
+  // Debug helper: copies a device buffer to the host, sums |element|^2 and
+  // reduces the result with coarse_grid->GlobalSum. Requires Allocate() first.
+  RealD blasNorm2(deviceVector<scalar> &blas)
+  {
+    scalar ss(0.0);
+    std::vector<scalar> tmp(blas.size());
+    acceleratorCopyFromDevice(&blas[0],&tmp[0],blas.size()*sizeof(scalar));
+    for(uint64_t s=0;s<tmp.size();s++){
+      ss=ss+tmp[s]*adj(tmp[s]);
+    }
+    coarse_grid->GlobalSum(ss);
+    return real(ss);
+  }
+  
+  // Start from a well defined empty state; previously the grid pointers and
+  // sizes were indeterminate until Allocate() or the destructor ran.
+  MultiRHSBlockProject()
+    : nbasis(0), coarse_grid(nullptr), fine_grid(nullptr),
+      block_vol(0), fine_vol(0), coarse_vol(0), words(0)
+  {};
+ ~MultiRHSBlockProject(){ Deallocate(); };
+  
+  // Forget the geometry and shrink all device buffers to zero length.
+  void Deallocate(void)
+  {
+    nbasis=0;
+    coarse_grid=nullptr;
+    fine_grid=nullptr;
+    fine_vol=0;
+    block_vol=0;
+    coarse_vol=0;
+    words=0;
+    BLAS_V.resize(0);
+    BLAS_F.resize(0);
+    BLAS_C.resize(0);
+  }
+  // Record the geometry and size the basis buffer; BLAS_F and BLAS_C are
+  // sized later, once the number of right hand sides is known.
+  void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid)
+  {
+    nbasis=_nbasis;
+
+    fine_grid=_fgrid;
+    coarse_grid=_cgrid;
+
+    fine_vol   = fine_grid->lSites();
+    coarse_vol = coarse_grid->lSites();
+    block_vol = fine_vol/coarse_vol;
+    
+    words = sizeof(scalar_object)/sizeof(scalar);
+
+    BLAS_V.resize (fine_vol * words * nbasis );
+  }
+  // Pack vecs (all on fine_grid) into blas using the layout
+  //   [lane][coarse osite][vector][block site][words]
+  // blas must already hold exactly fine_vol * vecs.size() * words scalars.
+  void ImportFineGridVectors(std::vector <Field > &vecs, deviceVector<scalar> &blas)
+  {
+    int nvec = vecs.size();
+    typedef typename Field::vector_object vobj;
+    std::cout << " BlockProjector importing "<<nvec<< " vectors" <<std::endl;
+
+    assert(nvec>0); // vecs[0] is dereferenced below
+    assert(vecs[0].Grid()==fine_grid);
+
+    subdivides(coarse_grid,fine_grid); // require they map
+
+    int _ndimension = coarse_grid->_ndimension;
+    assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
+    
+    Coordinate  block_r      (_ndimension);
+    for(int d=0 ; d<_ndimension;d++){
+      block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
+    }
+
+    uint64_t sz = blas.size();
+
+    acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar));
+
+    Coordinate fine_rdimensions = fine_grid->_rdimensions;
+    Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
+    int64_t bv= block_vol;
+    for(int v=0;v<nvec;v++){
+
+      //      std::cout << " BlockProjector importing vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
+      autoView( fineData   , vecs[v], AcceleratorRead);
+
+      auto blasData_p  = &blas[0];
+
+      int64_t osites = fine_grid->oSites();
+
+      // loop over fine sites
+      const int Nsimd = vobj::Nsimd();
+      //      std::cout << "sz "<<sz<<std::endl;
+      //      std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
+      assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
+      uint64_t lwords= words; // local variable for copy in to GPU
+      accelerator_for(sf,osites,Nsimd,{
+#ifdef GRID_SIMT
+        {
+	  int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+	  for(int lane=0;lane<Nsimd;lane++) {
+#endif
+	  // One thread per fine site
+	  Coordinate coor_f(_ndimension);
+	  Coordinate coor_b(_ndimension);
+	  Coordinate coor_c(_ndimension);
+
+	  // Fine site to fine coor
+	  Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
+
+	  // Split into coordinate within the block and coarse coordinate
+	  for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
+	  for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
+	  
+	  int sc;// coarse site
+	  int sb;// block site
+	  Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
+	  Lexicographic::IndexFromCoor(coor_b,sb,block_r);
+
+          scalar_object data = extractLane(lane,fineData[sf]);
+
+	  // BLAS layout address calculation
+	  // words * block_vol * nbasis x coarse_vol
+	  // coarse oSite x block vole x lanes
+	  int64_t site = (lane*osites + sc*bv)*nvec
+   	               + v*bv
+	               + sb;
+
+	  //	  assert(site*lwords<sz);
+
+	  scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
+
+	  *ptr = data;
+#ifdef GRID_SIMT
+	}
+#else
+	}
+#endif
+      });
+      //      std::cout << " import fine Blas norm "<<blasNorm2(blas)<<std::endl;
+      //      std::cout << " BlockProjector imported vector"<<v<<std::endl;
+    }
+  }
+  // Inverse of ImportFineGridVectors: unpack blas into vecs (all on fine_grid)
+  // using the same address calculation.
+  void ExportFineGridVectors(std::vector <Field> &vecs, deviceVector<scalar> &blas)
+  {
+    typedef typename Field::vector_object vobj;
+
+    int nvec = vecs.size();
+
+    assert(nvec>0); // vecs[0] is dereferenced below
+    assert(vecs[0].Grid()==fine_grid);
+
+    subdivides(coarse_grid,fine_grid); // require they map
+
+    int _ndimension = coarse_grid->_ndimension;
+    assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
+    
+    Coordinate  block_r      (_ndimension);
+    for(int d=0 ; d<_ndimension;d++){
+      block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
+    }
+    Coordinate fine_rdimensions = fine_grid->_rdimensions;
+    Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
+
+    //    std::cout << " export fine Blas norm "<<blasNorm2(blas)<<std::endl;
+
+    int64_t bv= block_vol;
+    for(int v=0;v<nvec;v++){
+
+      autoView( fineData   , vecs[v], AcceleratorWrite);
+
+      auto blasData_p  = &blas[0];
+
+      int64_t osites = fine_grid->oSites();
+      uint64_t lwords = words; // local copy so the kernel does not touch *this
+      //      std::cout << " Nsimd is "<<vobj::Nsimd() << std::endl;
+      //      std::cout << " lwords is "<<lwords << std::endl;
+      //      std::cout << " sizeof(scalar_object) is "<<sizeof(scalar_object) << std::endl;
+      // loop over fine sites
+      accelerator_for(sf,osites,vobj::Nsimd(),{
+      
+#ifdef GRID_SIMT
+        {
+	  int lane=acceleratorSIMTlane(vobj::Nsimd()); // buffer lane
+#else
+	  for(int lane=0;lane<vobj::Nsimd();lane++) {
+#endif
+	  // One thread per fine site
+	  Coordinate coor_f(_ndimension);
+	  Coordinate coor_b(_ndimension);
+	  Coordinate coor_c(_ndimension);
+
+	  Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
+
+	  for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
+	  for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
+	  
+	  int sc;
+	  int sb;
+	  Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
+	  Lexicographic::IndexFromCoor(coor_b,sb,block_r);
+
+	  // BLAS layout address calculation
+	  // words * block_vol * nbasis x coarse_vol 	  
+	  int64_t site = (lane*osites + sc*bv)*nvec
+   	               + v*bv
+	               + sb;
+
+	  scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
+
+	  scalar_object data = *ptr;
+
+	  insertLane(lane,fineData[sf],data);
+#ifdef GRID_SIMT
+	}
+#else
+	}
+#endif
+      });
+    }
+  }
+  // Pack coarse vectors (all on coarse_grid) into blas using the layout
+  //   [lane][coarse osite][vector][nbasis]
+  template<class vobj>
+  void ImportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
+  {
+    int nvec = vecs.size();
+    typedef typename vobj::scalar_object coarse_scalar_object;
+
+    std::cout << " BlockProjector importing coarse grid "<<nvec<< " vectors" <<std::endl;
+
+    assert(nvec>0); // vecs[0] is dereferenced below
+    assert(vecs[0].Grid()==coarse_grid);
+
+    for(int v=0;v<nvec;v++){
+
+      //      std::cout << " BlockProjector importing coarse vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
+      autoView( coarseData   , vecs[v], AcceleratorRead);
+
+      auto blasData_p  = &blas[0];
+
+      int64_t osites = coarse_grid->oSites();
+
+      // loop over coarse sites
+      const int Nsimd = vobj::Nsimd();
+      uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
+      assert(cwords==nbasis);
+      
+      accelerator_for(sc,osites,Nsimd,{
+#ifdef GRID_SIMT
+        {
+	  int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+	  for(int lane=0;lane<Nsimd;lane++) {
+#endif
+           // C_br per site
+	    int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
+	    
+	    coarse_scalar_object data = extractLane(lane,coarseData[sc]);
+
+	    coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
+
+	    *ptr = data;
+#ifdef GRID_SIMT
+	}
+#else
+	}
+#endif
+      });
+      //      std::cout << " import coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
+    }
+  }
+  // Inverse of ImportCoarseGridVectors: unpack blas into vecs (all on coarse_grid).
+  template<class vobj>
+  void ExportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
+  {
+    int nvec = vecs.size();
+    typedef typename vobj::scalar_object coarse_scalar_object;
+    // This message used to say "importing" (copied from the import routine)
+    std::cout << " BlockProjector exporting coarse grid "<<nvec<< " vectors" <<std::endl;
+
+    assert(nvec>0); // vecs[0] is dereferenced below
+    assert(vecs[0].Grid()==coarse_grid);
+
+    //    std::cout << " export coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
+    for(int v=0;v<nvec;v++){
+
+      //  std::cout << " BlockProjector exporting coarse vector"<<v<<std::endl;
+      autoView( coarseData   , vecs[v], AcceleratorWrite);
+
+      auto blasData_p  = &blas[0];
+
+      int64_t osites = coarse_grid->oSites();
+
+      // loop over coarse sites
+      const int Nsimd = vobj::Nsimd();
+      uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
+      assert(cwords==nbasis);
+      
+      accelerator_for(sc,osites,Nsimd,{
+	  // Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
+#ifdef GRID_SIMT
+        {
+	  int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+	  for(int lane=0;lane<Nsimd;lane++) {
+#endif
+	    int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
+	    coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
+	    coarse_scalar_object data = *ptr;
+	    insertLane(lane,coarseData[sc],data);
+#ifdef GRID_SIMT
+	}
+#else
+	}
+#endif
+      });
+    }
+  }
+  // Pack the basis into BLAS_V. BLAS_V was sized for nbasis vectors by
+  // Allocate(), and the import asserts that the sizes agree.
+  void ImportBasis(std::vector < Field > &vecs)
+  {
+    //    std::cout << " BlockProjector Import basis size "<<vecs.size()<<std::endl;
+    ImportFineGridVectors(vecs,BLAS_V);
+  }
+
+  // coarse[r] = V^dag fine[r] block by block, for all right hand sides at once.
+  template<class cobj>
+  void blockProject(std::vector<Field> &fine,std::vector< Lattice<cobj> > & coarse)
+  {
+    int nrhs=fine.size();
+    int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
+    assert(nbasis==_nbasis);
+    // The export below derives the BLAS_C layout from coarse.size(); it only
+    // matches the GEMM output layout when there is one coarse vector per rhs.
+    assert(coarse.size()==fine.size());
+    
+    BLAS_F.resize (fine_vol * words * nrhs );
+    BLAS_C.resize (coarse_vol * nbasis * nrhs );
+
+    /////////////////////////////////////////////
+    // Copy in the multi-rhs sources to same data layout
+    /////////////////////////////////////////////
+    //    std::cout << "BlockProject import fine"<<std::endl;
+    ImportFineGridVectors(fine,BLAS_F);
+    
+    deviceVector<scalar *> Vd(coarse_vol);
+    deviceVector<scalar *> Fd(coarse_vol);
+    deviceVector<scalar *> Cd(coarse_vol);
+
+    //    std::cout << "BlockProject pointers"<<std::endl;
+    // 64 bit index so the offset products below cannot wrap in int arithmetic
+    for(uint64_t c=0;c<coarse_vol;c++){
+      // BLAS_V[coarse_vol][nbasis][block_vol][words]
+      // BLAS_F[coarse_vol][nrhs][block_vol][words]
+      // BLAS_C[coarse_vol][nrhs][nbasis]
+      scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
+      scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
+      scalar * Ch = & BLAS_C[c*nrhs*nbasis];
+
+      acceleratorPut(Vd[c],Vh);
+      acceleratorPut(Fd[c],Fh);
+      acceleratorPut(Cd[c],Ch);
+    }
+
+    GridBLAS BLAS;
+
+    //    std::cout << "BlockProject BLAS"<<std::endl;
+    int64_t vw = block_vol * words;
+    /////////////////////////////////////////
+    // C_br = V^dag R
+    /////////////////////////////////////////
+    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
+    		     nbasis,nrhs,vw,
+		     ComplexD(1.0),
+		     Vd,
+		     Fd,
+		     ComplexD(0.0),  // wipe out C
+		     Cd);
+    BLAS.synchronise();
+    //    std::cout << "BlockProject done"<<std::endl;
+    ExportCoarseGridVectors(coarse, BLAS_C);
+    //    std::cout << "BlockProject done"<<std::endl;
+
+  }
+
+  // fine[r] = V coarse[r] block by block, for all right hand sides at once.
+  template<class cobj>
+  void blockPromote(std::vector<Field> &fine,std::vector<Lattice<cobj> > & coarse)
+  {
+    int nrhs=fine.size();
+    int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
+    assert(nbasis==_nbasis);
+    // The import below derives the BLAS_C layout from coarse.size(); it only
+    // matches the GEMM input layout when there is one coarse vector per rhs.
+    assert(coarse.size()==fine.size());
+    
+    BLAS_F.resize (fine_vol * words * nrhs );
+    BLAS_C.resize (coarse_vol * nbasis * nrhs );
+
+    ImportCoarseGridVectors(coarse, BLAS_C);
+
+    GridBLAS BLAS;
+
+    deviceVector<scalar *> Vd(coarse_vol);
+    deviceVector<scalar *> Fd(coarse_vol);
+    deviceVector<scalar *> Cd(coarse_vol);
+
+    // 64 bit index so the offset products below cannot wrap in int arithmetic
+    for(uint64_t c=0;c<coarse_vol;c++){
+      // BLAS_V[coarse_vol][nbasis][block_vol][words]
+      // BLAS_F[coarse_vol][nrhs][block_vol][words]
+      // BLAS_C[coarse_vol][nrhs][nbasis]
+      scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
+      scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
+      scalar * Ch = & BLAS_C[c*nrhs*nbasis];
+      acceleratorPut(Vd[c],Vh);
+      acceleratorPut(Fd[c],Fh);
+      acceleratorPut(Cd[c],Ch);
+    }
+
+    /////////////////////////////////////////
+    // Block promote:
+    // F_xr = Vxb Cbr (x coarse_vol)
+    /////////////////////////////////////////
+
+    int64_t vw = block_vol * words;
+    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
+    		     vw,nrhs,nbasis,
+		     ComplexD(1.0),
+		     Vd,
+		     Cd,
+		     ComplexD(0.0),  // wipe out C
+		     Fd);
+    BLAS.synchronise();
+    //    std::cout << " blas call done"<<std::endl;
+    
+    ExportFineGridVectors(fine, BLAS_F);
+    //    std::cout << " exported "<<std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
@@ -0,0 +1,234 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: MultiRHSDeflation.h
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+
+/* Need helper object for BLAS accelerated mrhs projection
+
+   i) MultiRHS Deflation
+
+   Import Evecs -> nev x vol x internal 
+   Import vector of Lattice objects -> nrhs x vol x internal
+   => Cij (nrhs x Nev) via GEMM.
+   => Guess  (nrhs x vol x internal)  = C x evecs (via GEMM)
+   Export
+
+   
+   ii) MultiRHS block projection
+
+   Import basis -> nblock x nbasis x  (block x internal) 
+   Import vector of fine lattice objects -> nblock x nrhs x (block x internal) 
+
+   => coarse_(nrhs x nbasis )^block = via batched GEMM
+
+   iii)   Alternate interface: 
+   Import higher dim Lattice object-> vol x nrhs layout
+   
+*/
+/*
+ * BLAS accelerated deflation of several right hand sides at once.
+ *
+ * Usage, as implied by the methods below:
+ *   ImportEigenBasis(evec,eval)   -- Allocate() + copy eigenvectors into BLAS_E
+ *   DeflateSources(source,guess)  -- guess[r] = sum_e evec[e] <evec[e]|source[r]> / eval[e]
+ */
+template<class Field>
+class MultiRHSDeflation
+{
+public:
+
+  typedef typename Field::scalar_type   scalar;
+  typedef typename Field::scalar_object scalar_object;
+
+  int nev;                  // number of eigenvectors held in BLAS_E
+  std::vector<RealD> eval;  // eigenvalues, one per imported eigenvector
+  GridBase *grid;
+  uint64_t vol;             // grid->lSites()
+  uint64_t words;           // scalars per scalar_object
+  
+  deviceVector<scalar> BLAS_E;      //  nev x vol -- the eigenbasis   (up to a 1/sqrt(lambda))
+  deviceVector<scalar> BLAS_R;      // nrhs x vol -- the sources
+  deviceVector<scalar> BLAS_G;      // nrhs x vol -- the guess
+  deviceVector<scalar> BLAS_C;      // nrhs x nev -- the coefficients 
+  
+  // Start from a well defined empty state; previously nev, grid, vol and
+  // words were indeterminate until Allocate() or the destructor ran.
+  MultiRHSDeflation() : nev(0), grid(nullptr), vol(0), words(0) {};
+  ~MultiRHSDeflation(){ Deallocate(); };
+  
+  // Forget the geometry and shrink all device buffers to zero length.
+  void Deallocate(void)
+  {
+    nev=0;
+    grid=nullptr;
+    vol=0;
+    words=0;
+    BLAS_E.resize(0);
+    BLAS_R.resize(0);
+    BLAS_C.resize(0);
+    BLAS_G.resize(0);
+  }
+  // Record the geometry and size the eigenvalue list and eigenvector buffer.
+  void Allocate(int _nev,GridBase *_grid)
+  {
+    nev=_nev;
+    grid=_grid;
+    vol   = grid->lSites();
+    words = sizeof(scalar_object)/sizeof(scalar);
+    eval.resize(nev);
+    BLAS_E.resize (vol * words * nev );
+    std::cout << GridLogMessage << " Allocate for "<<nev<<" eigenvectors and volume "<<vol<<std::endl;
+  }
+  // Copy one eigenvector into slot ev of BLAS_E and record its eigenvalue.
+  // Allocate() must have been called with nev > ev.
+  void ImportEigenVector(Field &evec,RealD &_eval, int ev)
+  {
+    assert(ev<eval.size());
+    std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
+    eval[ev] = _eval;
+
+    int64_t offset = ev*vol*words;
+    autoView(v,evec,AcceleratorRead);
+    acceleratorCopyDeviceToDevice(&v[0],&BLAS_E[offset],sizeof(scalar_object)*vol);
+
+  }
+  // Import every eigenvector in evec.
+  void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval)
+  {
+    ImportEigenBasis(evec,_eval,0,evec.size());
+  }
+  // Could use to import a batch of eigenvectors
+  // (re)allocates for _nev vectors, then imports evec[_ev0 .. _ev0+_nev-1]
+  void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
+  {
+    assert(evec.size()>0);                // evec[0].Grid() is read below
+    assert(_ev0>=0);
+    assert(_nev>=0);
+    assert(_ev0+_nev<=evec.size());
+    assert(_ev0+_nev<=_eval.size());      // _eval is indexed with the same range
+
+    Allocate(_nev,evec[0].Grid());
+    
+    // Imports a sub-batch of eigenvectors, _ev0, ..., _ev0+_nev-1
+    for(int e=0;e<nev;e++){
+      std::cout << "Importing eigenvector "<<e<<" evalue "<<_eval[_ev0+e]<<std::endl;
+      ImportEigenVector(evec[_ev0+e],_eval[_ev0+e],e);
+    }
+  }
+  // Fill guess[r] with the projection of source[r] onto the imported
+  // eigenvectors, each coefficient scaled by 1/eval[e]. Eigenvalues are used
+  // as given: a zero eigenvalue yields a non-finite coefficient.
+  void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
+  {
+    int nrhs = source.size();
+    assert(source.size()==guess.size());
+    if ( nrhs == 0 ) return; // nothing to deflate; guess[0] below would be out of range
+    assert(grid == guess[0].Grid());
+    conformable(guess[0],source[0]);
+
+    int64_t vw = vol * words;
+
+    std::cout << GridLogMessage << "MultiRHSDelation for "<<nrhs<<" sources with "<<nev<<" eigenvectors "<<std::endl;
+    RealD t0 = usecond();
+    BLAS_R.resize(nrhs * vw); // cost free if size doesn't change
+    BLAS_G.resize(nrhs * vw); // cost free if size doesn't change
+    BLAS_C.resize(nev * nrhs);// cost free if size doesn't change
+
+    /////////////////////////////////////////////
+    // Copy in the multi-rhs sources
+    /////////////////////////////////////////////
+    //    for(int r=0;r<nrhs;r++){
+    //      std::cout << " source["<<r<<"] = "<<norm2(source[r])<<std::endl;
+    //    }
+    for(int r=0;r<nrhs;r++){
+      int64_t offset = r*vw;
+      autoView(v,source[r],AcceleratorRead);
+      acceleratorCopyDeviceToDevice(&v[0],&BLAS_R[offset],sizeof(scalar_object)*vol);
+    }
+
+  /*
+   * in Fortran column major notation (cuBlas order)
+   *
+   * Exe = [e1(x)][..][en(x)]
+   *
+   * Rxr = [r1(x)][..][rm(x)]
+   *
+   * C_er = E^dag R
+   * C_er = C_er / lambda_e 
+   * G_xr = Exe Cer
+   */
+    // Batch of one: a single pointer per operand
+    deviceVector<scalar *> Ed(1);
+    deviceVector<scalar *> Rd(1);
+    deviceVector<scalar *> Cd(1);
+    deviceVector<scalar *> Gd(1);
+
+    scalar * Eh = & BLAS_E[0];
+    scalar * Rh = & BLAS_R[0];
+    scalar * Ch = & BLAS_C[0];
+    scalar * Gh = & BLAS_G[0];
+
+    acceleratorPut(Ed[0],Eh);
+    acceleratorPut(Rd[0],Rh);
+    acceleratorPut(Cd[0],Ch);
+    acceleratorPut(Gd[0],Gh);
+
+    GridBLAS BLAS;
+
+    /////////////////////////////////////////
+    // C_er = E^dag R
+    /////////////////////////////////////////
+    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
+    		     nev,nrhs,vw,
+		     ComplexD(1.0),
+		     Ed,
+		     Rd,
+		     ComplexD(0.0),  // wipe out C
+		     Cd);
+    BLAS.synchronise();
+
+    assert(BLAS_C.size()==nev*nrhs);
+
+    // The GEMM only contracted the local volume: sum the coefficients across
+    // the grid on the host, apply 1/lambda_e, then send them back to the device.
+    std::vector<scalar> HOST_C(BLAS_C.size());      // nrhs . nev -- the coefficients 
+    acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
+    grid->GlobalSumVector(&HOST_C[0],nev*nrhs);
+    for(int e=0;e<nev;e++){
+      RealD lam(1.0/eval[e]);
+      for(int r=0;r<nrhs;r++){
+	int off = e+nev*r;
+	HOST_C[off]=HOST_C[off] * lam;
+	//	std::cout << "C["<<e<<"]["<<r<<"] ="<<HOST_C[off]<< " eval[e] "<<eval[e] <<std::endl;
+      }
+    }
+    acceleratorCopyToDevice(&HOST_C[0],&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
+
+    
+    /////////////////////////////////////////
+    // Guess G_xr = Exe Cer
+    /////////////////////////////////////////
+    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
+		     vw,nrhs,nev,
+		     ComplexD(1.0),
+		     Ed, // x . nev
+		     Cd, // nev . nrhs
+		     ComplexD(0.0),
+		     Gd);
+    BLAS.synchronise();
+
+    ///////////////////////////////////////
+    // Copy out the multirhs
+    ///////////////////////////////////////
+    for(int r=0;r<nrhs;r++){
+      int64_t offset = r*vw;
+      autoView(v,guess[r],AcceleratorWrite);
+      acceleratorCopyDeviceToDevice(&BLAS_G[offset],&v[0],sizeof(scalar_object)*vol);
+    }
+    RealD t1 = usecond();
+    std::cout << GridLogMessage << "MultiRHSDelation for "<<nrhs<<" sources with "<<nev<<" eigenvectors took " << (t1-t0)/1e3 <<" ms"<<std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
@@ -33,109 +33,111 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * Script A = SolverMatrix 
   * Script P = Preconditioner
   *
-   * Deflation methods considered
-   *      -- Solve P A x = P b        [ like Luscher ]
-   * DEF-1        M P A x = M P b     [i.e. left precon]
-   * DEF-2        P^T M A x = P^T M b
-   * ADEF-1       Preconditioner = M P + Q      [ Q + M + M A Q]
-   * ADEF-2       Preconditioner = P^T M + Q
-   * BNN          Preconditioner = P^T M P + Q
-   * BNN2         Preconditioner = M P + P^TM +Q - M P A M 
-   * 
   * Implement ADEF-2
   *
   * Vstart = P^Tx + Qb
   * M1 = P^TM + Q
   * M2=M3=1
-   * Vout = x
   */
+NAMESPACE_BEGIN(Grid);

-// abstract base
-template<class Field, class CoarseField>
-class TwoLevelFlexiblePcg : public LinearFunction<Field>
+
+template<class Field>
+class TwoLevelCG : public LinearFunction<Field>
 {
 public:
-  int verbose;
  RealD   Tolerance;
  Integer MaxIterations;
-  const int mmax = 5;
  GridBase *grid;
-  GridBase *coarsegrid;

-  LinearOperatorBase<Field>   *_Linop
-  OperatorFunction<Field>     *_Smoother,
-  LinearFunction<CoarseField> *_CoarseSolver;
-
-  // Need somthing that knows how to get from Coarse to fine and back again
+  // Fine operator, Smoother, CoarseSolver
+  LinearOperatorBase<Field>   &_FineLinop;
+  LinearFunction<Field>   &_Smoother;
  
  // more most opertor functions
-  TwoLevelFlexiblePcg(RealD tol,
-		     Integer maxit,
-		     LinearOperatorBase<Field> *Linop,
-		     LinearOperatorBase<Field> *SmootherLinop,
-		     OperatorFunction<Field>   *Smoother,
-		     OperatorFunction<CoarseField>  CoarseLinop
-		     ) : 
+  TwoLevelCG(RealD tol, // stored in Tolerance; operator() stops once rn <= norm2(src)*tol*tol
+	     Integer maxit, // stored in MaxIterations; bounds the iteration loop in operator()
+	     LinearOperatorBase<Field>   &FineLinop, // held by reference in _FineLinop, so it must outlive this object
+	     LinearFunction<Field>       &Smoother, // held by reference in _Smoother, so it must outlive this object
+	     GridBase *fine) : // grid used to construct the solver's work fields
      Tolerance(tol), 
      MaxIterations(maxit),
-      _Linop(Linop),
-      _PreconditionerLinop(PrecLinop),
-      _Preconditioner(Preconditioner)
-  { 
-    verbose=0;
+      _FineLinop(FineLinop),
+      _Smoother(Smoother)
+  {
+    grid       = fine; // plain pointer copy; nothing is allocated here
  };
-
-  // The Pcg routine is common to all, but the various matrices differ from derived 
-  // implementation to derived implmentation
-  void operator() (const Field &src, Field &psi){
-  void operator() (const Field &src, Field &psi){
-
-    psi.Checkerboard() = src.Checkerboard();
-    grid             = src.Grid();
-
+  
+  virtual void operator() (const Field &src, Field &x)
+  {
+    std::cout << GridLogMessage<<"HDCG: fPcg starting single RHS"<<std::endl;
    RealD f;
    RealD rtzp,rtz,a,d,b;
    RealD rptzp;
-    RealD tn;
-    RealD guess = norm2(psi);
-    RealD ssq   = norm2(src);
-    RealD rsq   = ssq*Tolerance*Tolerance;
-    
+
    /////////////////////////////
    // Set up history vectors
    /////////////////////////////
-    std::vector<Field> p  (mmax,grid);
+    int mmax = 5;
+    std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
+    std::vector<Field> p(mmax,grid);
    std::vector<Field> mmp(mmax,grid);
    std::vector<RealD> pAp(mmax);
-
-    Field x  (grid); x = psi;
-    Field z  (grid);
+    Field z(grid);
    Field tmp(grid);
-    Field r  (grid);
-    Field mu (grid);
-  
+    Field  mp (grid);
+    Field  r  (grid);
+    Field  mu (grid);
+    
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated"<<std::endl;
+    //Initial residual computation & set up
+    RealD guess   = norm2(x);
+    std::cout << GridLogMessage<<"HDCG: fPcg guess nrm "<<guess<<std::endl;
+    RealD src_nrm = norm2(src);
+    std::cout << GridLogMessage<<"HDCG: fPcg src nrm "<<src_nrm<<std::endl;
+    
+    if ( src_nrm == 0.0 ) {
+      std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
+      x=Zero();
+    }
+    RealD tn;
+    
+    GridStopWatch HDCGTimer;
+    HDCGTimer.Start();
    //////////////////////////
    // x0 = Vstart -- possibly modify guess
    //////////////////////////
-    x=src;
    Vstart(x,src);
-
+    
    // r0 = b -A x0
-    HermOp(x,mmp); // Shouldn't this be something else?
+    _FineLinop.HermOp(x,mmp[0]);
    axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
+    {
+      double n1 = norm2(x);
+      double n2 = norm2(mmp[0]);
+      double n3 = norm2(r);
+      std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
+    }

    //////////////////////////////////
    // Compute z = M1 x
    //////////////////////////////////
-    M1(r,z,tmp,mp,SmootherMirs);
+    PcgM1(r,z);
    rtzp =real(innerProduct(r,z));
-
+    
    ///////////////////////////////////////
    // Solve for Mss mu = P A z and set p = z-mu
-    // Def2: p = 1 - Q Az = Pright z 
+    // Def2 p = 1 - Q Az = Pright z
    // Other algos M2 is trivial
    ///////////////////////////////////////
-    M2(z,p[0]);
+    PcgM2(z,p[0]);
+
+    RealD ssq =  norm2(src);
+    RealD rsq =  ssq*Tolerance*Tolerance;
+
+    std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
+
+    Field pp(grid);

    for (int k=0;k<=MaxIterations;k++){
    
@@ -143,31 +145,46 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
      int peri_kp = (k+1) % mmax;

      rtz=rtzp;
-      d= M3(p[peri_k],mp,mmp[peri_k],tmp);
+      d= PcgM3(p[peri_k],mmp[peri_k]);
      a = rtz/d;
    
      // Memorise this
      pAp[peri_k] = d;
-
+      
      axpy(x,a,p[peri_k],x);
      RealD rn = axpy_norm(r,-a,mmp[peri_k],r);

      // Compute z = M x
-      M1(r,z,tmp,mp);
-
+      PcgM1(r,z);
+      
+      {
+	RealD n1,n2;
+	n1=norm2(r);
+	n2=norm2(z);
+	std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
+      }
      rtzp =real(innerProduct(r,z));
+      std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";

-      M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
+      //    PcgM2(z,p[0]);
+      PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
+      
+      p[peri_kp]=mu;

-      p[peri_kp]=p[peri_k];
-
-      // Standard search direction  p -> z + b p    ; b = 
+      // Standard search direction  p -> z + b p    
      b = (rtzp)/rtz;
-
+      
      int northog;
+      // k=zero  <=> peri_kp=1;        northog = 1
+      // k=1     <=> peri_kp=2;        northog = 2
+      // ...               ...                  ...
+      // k=mmax-2<=> peri_kp=mmax-1;   northog = mmax-1
+      // k=mmax-1<=> peri_kp=0;        northog = 1
+
      //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
      northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
    
+      std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
      for(int back=0; back < northog; back++){
 	int peri_back = (k-back)%mmax;
 	RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
@@ -176,75 +193,324 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
      }

      RealD rrn=sqrt(rn/ssq);
-      std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
+      RealD rtn=sqrt(rtz/ssq);
+      RealD rtnp=sqrt(rtzp/ssq);
+
+      std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";

      // Stopping condition
      if ( rn <= rsq ) { 

-	HermOp(x,mmp); // Shouldn't this be something else?
+	HDCGTimer.Stop();
+	std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
+	
+	_FineLinop.HermOp(x,mmp[0]);			  
 	axpy(tmp,-1.0,src,mmp[0]);
 	
-	RealD psinorm = sqrt(norm2(x));
-	RealD srcnorm = sqrt(norm2(src));
-	RealD tmpnorm = sqrt(norm2(tmp));
-	RealD true_residual = tmpnorm/srcnorm;
-	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
-	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
-	return k;
+	RealD  mmpnorm = sqrt(norm2(mmp[0]));
+	RealD  xnorm   = sqrt(norm2(x));
+	RealD  srcnorm = sqrt(norm2(src));
+	RealD  tmpnorm = sqrt(norm2(tmp));
+	RealD  true_residual = tmpnorm/srcnorm;
+	std::cout<<GridLogMessage
+	       <<"HDCG: true residual is "<<true_residual
+	       <<" solution "<<xnorm
+	       <<" source "<<srcnorm
+	       <<" mmp "<<mmpnorm	  
+	       <<std::endl;
+      
+	return;
      }
+
    }
-    // Non-convergence
-    assert(0);
+    HDCGTimer.Stop();
+    std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
+    RealD  xnorm   = sqrt(norm2(x));
+    RealD  srcnorm = sqrt(norm2(src));
+    std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
  }

+
+
+  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
+  { // Multi-RHS solve: the flexible PCG recurrence is carried per right-hand side; Vstart and PcgM1 are applied to the whole batch
+    std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
+    src[0].Grid()->Barrier(); // NOTE(review): src[0] is used before nrhs is read, so src must be non-empty
+    int nrhs = src.size();
+    std::vector<RealD> f(nrhs);
+    std::vector<RealD> rtzp(nrhs);
+    std::vector<RealD> rtz(nrhs);
+    std::vector<RealD> a(nrhs);
+    std::vector<RealD> d(nrhs);
+    std::vector<RealD> b(nrhs);
+    std::vector<RealD> rptzp(nrhs); // NOTE(review): f and rptzp are not referenced again in this function
+    /////////////////////////////
+    // Set up history vectors
+    /////////////////////////////
+    int mmax = 3; // number of search directions kept per right-hand side; slots are indexed modulo mmax below
+    std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
+    src[0].Grid()->Barrier();
+    std::vector<std::vector<Field> > p(nrhs);   for(int r=0;r<nrhs;r++)  p[r].resize(mmax,grid);
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated p"<<std::endl;
+    src[0].Grid()->Barrier();
+    std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated mmp"<<std::endl;
+    src[0].Grid()->Barrier();
+    std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated pAp"<<std::endl;
+    src[0].Grid()->Barrier();
+    std::vector<Field> z(nrhs,grid);
+    std::vector<Field>  mp (nrhs,grid); // NOTE(review): mp is not referenced again in this function
+    std::vector<Field>  r  (nrhs,grid);
+    std::vector<Field>  mu (nrhs,grid);
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated z,mp,r,mu"<<std::endl;
+    src[0].Grid()->Barrier();
+
+    //Initial residual computation & set up
+    std::vector<RealD> src_nrm(nrhs);
+    for(int rhs=0;rhs<nrhs;rhs++) {
+      src_nrm[rhs]=norm2(src[rhs]);
+      assert(src_nrm[rhs]!=0.0); // a zero source is rejected outright
+    }
+    std::vector<RealD> tn(nrhs); // NOTE(review): tn is not referenced again in this function
+
+    GridStopWatch HDCGTimer;
+    HDCGTimer.Start();
+    //////////////////////////
+    // x0 = Vstart -- possibly modify guess
+    //////////////////////////
+    Vstart(x,src);
+
+    for(int rhs=0;rhs<nrhs;rhs++){
+      // r0 = b -A x0
+      _FineLinop.HermOp(x[rhs],mmp[rhs][0]);
+      axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]);    // Recomputes r=src-Ax0
+    }
+
+    //////////////////////////////////
+    // Compute z = M1 r
+    //////////////////////////////////
+    // This needs a multiRHS version for acceleration
+    PcgM1(r,z);
+
+    std::vector<RealD> ssq(nrhs);
+    std::vector<RealD> rsq(nrhs);
+    std::vector<Field> pp(nrhs,grid); // NOTE(review): pp is not referenced again in this function
+
+    for(int rhs=0;rhs<nrhs;rhs++){
+      rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
+      p[rhs][0]=z[rhs];
+      ssq[rhs]=norm2(src[rhs]); // same quantity as src_nrm[rhs] computed above
+      rsq[rhs]=  ssq[rhs]*Tolerance*Tolerance; // only logged here; the stopping test below uses max_rn
+      std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
+    }
+
+    std::vector<RealD> rn(nrhs);
+    for (int k=0;k<=MaxIterations;k++){
+    
+      int peri_k  = k % mmax; // history slot of the current search direction
+      int peri_kp = (k+1) % mmax; // history slot that receives the next search direction
+
+      for(int rhs=0;rhs<nrhs;rhs++){
+	rtz[rhs]=rtzp[rhs];
+	d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
+	a[rhs] = rtz[rhs]/d[rhs];
+    
+	// Memorise this
+	pAp[rhs][peri_k] = d[rhs];
+
+	axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
+	rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
+      }
+
+      // Compute z = M1 r (for *all* RHS)
+      PcgM1(r,z);
+      std::cout << GridLogMessage<<"HDCG::fPcg M1 complete"<<std::endl;
+      grid->Barrier();
+      
+      RealD max_rn=0.0;
+      for(int rhs=0;rhs<nrhs;rhs++){
+
+	rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
+
+	std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
+	
+	mu[rhs]=z[rhs]; // plain copy where the single-RHS loop calls PcgM2(z,mu); a PcgM2 override is not consulted here
+
+	p[rhs][peri_kp]=mu[rhs];
+
+	// Standard search direction p == z + b p
+	b[rhs] = (rtzp[rhs])/rtz[rhs]; // NOTE(review): b is stored but not read again in this function
+
+	int northog = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
+	std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
+	for(int back=0; back < northog; back++){
+	  int peri_back = (k-back)%mmax; // slot of the direction stored 'back' iterations ago
+	  RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
+	  RealD beta = -pbApk/pAp[rhs][peri_back];
+	  axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
+	}
+
+	RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
+	RealD rtn=sqrt(rtz[rhs]/ssq[rhs]); // NOTE(review): rtn is not used below
+	RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]); // NOTE(review): rtnp is not used below
+	
+	std::cout<<GridLogMessage<<"HDCG: rhs "<<rhs<<"fPcg k= "<<k<<" residual = "<<rrn<<"\n";
+	if ( rrn > max_rn ) max_rn = rrn;
+      }
+
+      // Stopping condition based on worst case
+      if ( max_rn <= Tolerance ) { 
+
+	HDCGTimer.Stop();
+	std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
+
+	for(int rhs=0;rhs<nrhs;rhs++){
+	  _FineLinop.HermOp(x[rhs],mmp[rhs][0]);			  // mmp[rhs][0] is reused as scratch for A x
+	  Field tmp(grid);
+	  axpy(tmp,-1.0,src[rhs],mmp[rhs][0]); // tmp = A x - src, the true residual vector
+      
+	  RealD  mmpnorm = sqrt(norm2(mmp[rhs][0]));
+	  RealD  xnorm   = sqrt(norm2(x[rhs]));
+	  RealD  srcnorm = sqrt(norm2(src[rhs]));
+	  RealD  tmpnorm = sqrt(norm2(tmp));
+	  RealD  true_residual = tmpnorm/srcnorm;
+	  std::cout<<GridLogMessage
+		   <<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
+		   <<" solution "<<xnorm
+		   <<" source "<<srcnorm
+		   <<" mmp "<<mmpnorm	  
+		   <<std::endl;
+	}
+	return;
+      }
+      
+    }
+    HDCGTimer.Stop(); // reached only when the iteration limit is exhausted; this is logged, not asserted
+    std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
+    for(int rhs=0;rhs<nrhs;rhs++){
+      RealD  xnorm   = sqrt(norm2(x[rhs]));
+      RealD  srcnorm = sqrt(norm2(src[rhs]));
+      std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+    }
+  }
+  
+
 public:

-  virtual void M(Field & in,Field & out,Field & tmp) {
+  virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out)
+  { // Default batched preconditioner: no real multi-RHS work, just one single-RHS call per entry
+    std::cout << "PcgM1 default (cheat) mrhs version"<<std::endl; // NOTE(review): printed without the GridLogMessage prefix used by the solver loops
+    for(int rhs=0;rhs<in.size();rhs++){ // out is indexed in step with in, so it must hold at least in.size() fields
+      this->PcgM1(in[rhs],out[rhs]);
+    }
+  }
+  virtual void PcgM1(Field & in, Field & out)     =0; // single-RHS preconditioner; every concrete solver must supply it
+  virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
+  { // Default batched start vector: one single-RHS Vstart call per entry
+    std::cout << "Vstart default (cheat) mrhs version"<<std::endl; // NOTE(review): printed without the GridLogMessage prefix used by the solver loops
+    for(int rhs=0;rhs<x.size();rhs++){ // src is indexed in step with x, so it must hold at least x.size() fields
+      this->Vstart(x[rhs],src[rhs]);
+    }
+  }
+  virtual void Vstart(Field & x,const Field & src)=0; // single-RHS start vector; every concrete solver must supply it

+  virtual void PcgM2(const Field & in, Field & out) { // Default second preconditioner step: a plain copy
+    out=in; // derived solvers may override when a non-trivial operation is needed
  }

-  virtual void M1(Field & in, Field & out) {// the smoother
+  virtual RealD PcgM3(const Field & p, Field & mmp){ // Applies the fine operator to p and returns the real part of <p|A p>
+    RealD dd;
+    _FineLinop.HermOp(p,mmp); // mmp receives A p; the solver loops reuse it for the residual update
+    ComplexD dot = innerProduct(p,mmp);
+    dd=real(dot); // the imaginary part is discarded
+    return dd;
+  }

+  /////////////////////////////////////////////////////////////////////
+  // Only Def1 has non-trivial Vout.
+  /////////////////////////////////////////////////////////////////////
+
+};
+  
+template<class Field, class CoarseField, class Aggregation>
+class TwoLevelADEF2 : public TwoLevelCG<Field>
+{
+ public:
+  ///////////////////////////////////////////////////////////////////////////////////
+  // Need something that knows how to get from Coarse to fine and back again
+  //  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
+  //  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
+  ///////////////////////////////////////////////////////////////////////////////////
+  GridBase *coarsegrid;
+  Aggregation &_Aggregates;                    
+  LinearFunction<CoarseField> &_CoarseSolver;
+  LinearFunction<CoarseField> &_CoarseSolverPrecise;
+  ///////////////////////////////////////////////////////////////////////////////////
+  
+  // more most operator functions
+  TwoLevelADEF2(RealD tol,
+		Integer maxit,
+		LinearOperatorBase<Field>    &FineLinop,
+		LinearFunction<Field>        &Smoother,
+		LinearFunction<CoarseField>  &CoarseSolver, // coarse solve applied inside PcgM1
+		LinearFunction<CoarseField>  &CoarseSolverPrecise, // coarse solve applied inside Vstart
+		Aggregation &Aggregates
+		) :
+      TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid), // the fine grid is taken from the aggregation object
+      _CoarseSolver(CoarseSolver),
+      _CoarseSolverPrecise(CoarseSolverPrecise),
+      _Aggregates(Aggregates) // NOTE(review): listed last here but declared before the solvers; all are references, so order only affects compiler warnings
+  {
+    coarsegrid = Aggregates.CoarseGrid; // grid used for the coarse temporaries in PcgM1 and Vstart
+  };
+
+  virtual void PcgM1(Field & in, Field & out)
+  {
+    GRID_TRACE("MultiGridPreconditioner ");
    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
-    Field tmp(grid);
-    Field Min(grid);

-    PcgM(in,Min); // Smoother call
+    Field tmp(this->grid);
+    Field Min(this->grid);
+    CoarseField PleftProj(this->coarsegrid); // coarse-space right-hand side
+    CoarseField PleftMss_proj(this->coarsegrid); // coarse-space solution

-    HermOp(Min,out);
+    GridStopWatch SmootherTimer;
+    GridStopWatch MatrixTimer;
+    SmootherTimer.Start();
+    this->_Smoother(in,Min); // Min = M in (smoother call)
+    SmootherTimer.Stop();
+
+    MatrixTimer.Start();
+    this->_FineLinop.HermOp(Min,out); // out = A Min
+    MatrixTimer.Stop();
    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min

-    ProjectToSubspace(tmp,PleftProj);     
-    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
-    PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+    GridStopWatch ProjTimer;
+    GridStopWatch CoarseTimer;
+    GridStopWatch PromTimer;
+    ProjTimer.Start();
+    this->_Aggregates.ProjectToSubspace(PleftProj,tmp);     // coarse projection of in - A Min
+    ProjTimer.Stop();
+    CoarseTimer.Start();
+    this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
+    CoarseTimer.Stop();
+    PromTimer.Start();
+    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+    PromTimer.Stop();
+    std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl; // per-call timings go to the performance log stream only
+    std::cout << GridLogPerformance << "\tSmoother   " << SmootherTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\tProj       " << ProjTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\tCoarse     " << CoarseTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\tProm       " << PromTimer.Elapsed() <<std::endl;
+
    axpy(out,1.0,Min,tmp); // Min+tmp
  }

-  virtual void M2(const Field & in, Field & out) {
-    out=in;
-    // Must override for Def2 only
-    //  case PcgDef2:
-    //    Pright(in,out);
-    //    break;
-  }
-
-  virtual RealD M3(const Field & p, Field & mmp){
-    double d,dd;
-    HermOpAndNorm(p,mmp,d,dd);
-    return dd;
-    // Must override for Def1 only
-    //  case PcgDef1:
-    //    d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
-    //      linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
-    //    Pleft(mp,mmp);
-    //    d=real(linop_d->inner(p,mmp));
-  }
-
-  virtual void VstartDef2(Field & xconst Field & src){
-    //case PcgDef2:
-    //case PcgAdef2: 
-    //case PcgAdef2f:
-    //case PcgV11f:
+  virtual void Vstart(Field & x,const Field & src)
+  {
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart "<<std::endl;
    ///////////////////////////////////
    // Choose x_0 such that 
    // x_0 = guess +  (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@@ -256,142 +522,211 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
    //                   = 0 
    ///////////////////////////////////
-    Field r(grid);
-    Field mmp(grid);
+    Field r(this->grid);
+    Field mmp(this->grid);
+    CoarseField PleftProj(this->coarsegrid);
+    CoarseField PleftMss_proj(this->coarsegrid);
+
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart projecting "<<std::endl;
+    this->_Aggregates.ProjectToSubspace(PleftProj,src);     
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart coarse solve "<<std::endl;
+    this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
+    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);  
+
+  }
+
+};
+
+template<class Field, class CoarseField, class Aggregation>
+class TwoLevelADEF2mrhs : public TwoLevelADEF2<Field,CoarseField,Aggregation>
+{ // ADEF2 variant that overrides the batched Vstart/PcgM1 so the coarse solve is issued once for all right-hand sides
+public:
+  GridBase *coarsegridmrhs; // grid of the stacked coarse fields; presumably one slice per RHS along dimension 0 (see the slice calls) - TODO confirm
+  LinearFunction<CoarseField> &_CoarseSolverMrhs; // batched coarse solve used in PcgM1
+  LinearFunction<CoarseField> &_CoarseSolverPreciseMrhs; // batched coarse solve used in Vstart
+  LinearFunction<CoarseField> &_CoarseGuesser; // applied per RHS to fill the initial content of the stacked solution field
+  TwoLevelADEF2mrhs(RealD tol,
+		    Integer maxit,
+		    LinearOperatorBase<Field>    &FineLinop,
+		    LinearFunction<Field>        &Smoother,
+		    //		    LinearFunction<CoarseField>  &CoarseSolver,
+		    //		    LinearFunction<CoarseField>  &CoarseSolverPrecise,
+		    LinearFunction<CoarseField>  &CoarseSolverMrhs,
+		    LinearFunction<CoarseField>  &CoarseSolverPreciseMrhs,
+		    LinearFunction<CoarseField>  &CoarseGuesser,
+		    GridBase *rhsgrid,
+		    Aggregation &Aggregates) :
+    TwoLevelADEF2<Field,CoarseField,Aggregation>(tol, maxit,FineLinop,Smoother,CoarseSolverMrhs,CoarseSolverPreciseMrhs,Aggregates), // NOTE(review): the mrhs solvers also fill the base class single-RHS solver slots
+    _CoarseSolverMrhs(CoarseSolverMrhs),
+    _CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs),
+    _CoarseGuesser(CoarseGuesser)
+  {
+    coarsegridmrhs = rhsgrid;
+  };
+
+  virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
+  {
+    int nrhs=x.size();
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart for "<<nrhs<<" right hand sides" <<std::endl;
+    ///////////////////////////////////
+    // Choose x_0 such that 
+    // x_0 = guess +  (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
+    //                               = [1 - Ass_inv A] Guess + Assinv src
+    //                               = P^T guess + Assinv src 
+    //                               = Vstart  [Tang notation]
+    // This gives:
+    // W^T (src - A x_0) = src_s - A guess_s - r_s
+    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
+    //                   = 0 
+    ///////////////////////////////////
+    CoarseField PleftProj(this->coarsegrid);
+    CoarseField PleftMss_proj(this->coarsegrid);
+
+    CoarseField PleftProjMrhs(this->coarsegridmrhs);
+    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
+
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart Mrhs projecting "<<std::endl;
+
+    for(int rhs=0;rhs<nrhs;rhs++) {
+      this->_Aggregates.ProjectToSubspace(PleftProj,src[rhs]);     // can optimise later
+      InsertSliceFast(PleftProj,PleftProjMrhs,rhs,0); // stack the projected source as slice 'rhs'
+      this->_CoarseGuesser(PleftProj,PleftMss_proj);
+      InsertSliceFast(PleftMss_proj,PleftMss_projMrhs,rhs,0); // stack the guesser output as slice 'rhs' of the solution field
+    }
    
-    HermOp(x,mmp);
-    axpy (r, -1.0, mmp, src);        // r_{-1} = src - A x
-    ProjectToSubspace(r,PleftProj);     
-    ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
-    PromoteFromSubspace(PleftMss_proj,mmp);  
-    x=x+mmp;
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart Mrhs coarse solve "<<std::endl;
+    this->_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s

+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
+    for(int rhs=0;rhs<nrhs;rhs++) {
+      ExtractSliceFast(PleftMss_proj,PleftMss_projMrhs,rhs,0); // pull slice 'rhs' back out of the stacked solution
+      this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x[rhs]); // NOTE(review): only src is projected above; the incoming x is presumably overwritten - confirm
+    }
  }

+  virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
+
+    int nrhs=in.size();
+    std::cout << " mrhs PcgM1 for "<<nrhs<<" right hand sides"<<std::endl;
+    MemoryManager::Print(); // NOTE(review): memory report is printed on every preconditioner application
+    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
+    Field tmp(this->grid);
+    std::vector<Field> Min(nrhs,this->grid); // smoother output is kept per RHS until the final combination loop
+    std::cout << " mrhs PcgM1 Min "<<std::endl;
+    CoarseField PleftProj(this->coarsegrid);
+    CoarseField PleftMss_proj(this->coarsegrid);
+
+    CoarseField PleftProjMrhs(this->coarsegridmrhs);
+    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
+    std::cout << " mrhs Coarse ops "<<std::endl;
+
+    // Really want the coarse solver
+    // to do the guessing itself, knowing the eigenvectors.
+    // The projection to coarse space is in aggregates
+    // If the Aggregates have a layout change option
+    // they could formulate as a BLAS routine.
+    // Put the routines in this object
+    for(int rhs=0;rhs<nrhs;rhs++) {
+
+      std::cout << GridLogMessage<<" Smoother for "<<rhs<<std::endl;
+      this->_Smoother(in[rhs],Min[rhs]);
+
+      std::cout << GridLogMessage<<" HermOp for "<<rhs<<std::endl;
+      this->_FineLinop.HermOp(Min[rhs],out[rhs]);
+
+      axpy(tmp,-1.0,out[rhs],in[rhs]);          // tmp  = in - A Min
+
+      // Was
+      //      this->_Aggregates.ProjectToSubspace(PleftProj,tmp);     // can optimise later
+      // Now:
+      std::cout << GridLogMessage<<" blockProject for "<<rhs<<std::endl;
+      blockProjectFast(PleftProj,tmp,this->_Aggregates.subspace);
+      
+      std::cout << GridLogMessage<<" InsertSlice for "<<rhs<<std::endl;
+      InsertSlice(PleftProj,PleftProjMrhs,rhs,0); // NOTE(review): Vstart uses InsertSliceFast/ExtractSliceFast for the same stacking
+      
+      std::cout << GridLogMessage<<" CoarseGuesser for "<<rhs<<std::endl;
+      this->_CoarseGuesser(PleftProj,PleftMss_proj);
+
+      std::cout << GridLogMessage<<" InsertSlice for "<<rhs<<std::endl;
+      InsertSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
+    }
+    MemoryManager::Print();
+
+    std::cout << " Coarse solve "<<std::endl;
+    this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
+    std::cout << " Coarse solve done"<<std::endl;
+    MemoryManager::Print();
+
+    for(int rhs=0;rhs<nrhs;rhs++) {
+      std::cout << GridLogMessage<<" Extract for "<<rhs<<std::endl;
+      ExtractSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
+      std::cout << GridLogMessage<<" Promote for "<<rhs<<std::endl;
+      this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+								    //      std::cout << " add for "<<rhs<<std::endl;
+      axpy(out[rhs],1.0,Min[rhs],tmp); // Min+tmp
+    }
+    MemoryManager::Print();
+    std::cout << " Extracted "<<std::endl;
+  }
+};
+  
+template<class Field>
+class TwoLevelADEF1defl : public TwoLevelCG<Field>
+{ // Two-level solver whose preconditioner deflates with explicitly supplied eigenvectors and eigenvalues
+public:
+  const std::vector<Field> &evec; // reference to the eigenvectors passed to the constructor (not copied)
+  const std::vector<RealD> &eval; // reference to the matching eigenvalues; eval[i] is paired with evec[i] in PcgM1
+  
+  TwoLevelADEF1defl(RealD tol,
+		   Integer maxit,
+		   LinearOperatorBase<Field>   &FineLinop,
+		   LinearFunction<Field>   &Smoother,
+		   std::vector<Field> &_evec,
+		   std::vector<RealD> &_eval) : 
+    TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()), // NOTE(review): reads _evec[0], so at least one eigenvector is required
+    evec(_evec),
+    eval(_eval)
+  {};
+
+  // Can just inherit existing PcgM2
+  // Can just inherit existing PcgM3
+
+  // Simple vstart - use the source itself as the starting vector
  virtual void Vstart(Field & x,const Field & src){
-    return;
+    x=src; // Could apply Q
+  };
+
+  // Override PcgM1
+  virtual void PcgM1(Field & in, Field & out)
+  {
+    GRID_TRACE("EvecPreconditioner ");
+    int N=evec.size();
+    Field Pin(this->grid);
+    Field Qin(this->grid);
+
+    //MP  + Q = M(1-AQ) + Q = M
+    // // If we are eigenvector deflating in coarse space
+    // // Q   = Sum_i |phi_i> 1/lambda_i <phi_i|
+    // // A Q = Sum_i |phi_i> <phi_i|
+    // // M(1-AQ) = M(1-proj) + Q
+    Qin.Checkerboard()=in.Checkerboard();
+    Qin = Zero();
+    Pin = in;
+    for (int i=0;i<N;i++) {
+      const Field& tmp = evec[i];
+      auto ip = TensorRemove(innerProduct(tmp,in)); // ip = <phi_i|in>
+      axpy(Qin, ip / eval[i],tmp,Qin); // Qin += (ip/lambda_i) phi_i, accumulating Q in
+      axpy(Pin, -ip ,tmp,Pin); // Pin -= ip phi_i, accumulating (1-proj) in
+    }
+
+    this->_Smoother(Pin,out); // out = M (1-proj) in
+
+    out = out + Qin; // add the deflated part Q in
  }
+};

-  /////////////////////////////////////////////////////////////////////
-  // Only Def1 has non-trivial Vout. Override in Def1
-  /////////////////////////////////////////////////////////////////////
-  virtual void   Vout  (Field & in, Field & out,Field & src){
-    out = in;
-    //case PcgDef1:
-    //    //Qb + PT x
-    //    ProjectToSubspace(src,PleftProj);     
-    //    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
-    //    PromoteFromSubspace(PleftMss_proj,tmp);  
-    //    
-    //    Pright(in,out);
-    //    
-    //    linop_d->axpy(out,tmp,out,1.0);
-    //    break;
-  }
+NAMESPACE_END(Grid);

-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // Pright and Pleft are common to all implementations
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  virtual void Pright(Field & in,Field & out){
-    // P_R  = [ 1              0 ] 
-    //        [ -Mss^-1 Msb    0 ] 
-    Field in_sbar(grid);
-
-    ProjectToSubspace(in,PleftProj);     
-    PromoteFromSubspace(PleftProj,out);  
-    axpy(in_sbar,-1.0,out,in);       // in_sbar = in - in_s 
-
-    HermOp(in_sbar,out);
-    ProjectToSubspace(out,PleftProj);           // Mssbar in_sbar  (project)
-
-    ApplyInverse     (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar 
-    PromoteFromSubspace(PleftMss_proj,out);     // 
-
-    axpy(out,-1.0,out,in_sbar);     // in_sbar - Mss^{-1} Mssbar in_sbar
-  }
-  virtual void Pleft (Field & in,Field & out){
-    // P_L  = [ 1  -Mbs Mss^-1] 
-    //        [ 0   0         ] 
-    Field in_sbar(grid);
-    Field    tmp2(grid);
-    Field    Mtmp(grid);
-
-    ProjectToSubspace(in,PleftProj);     
-    PromoteFromSubspace(PleftProj,out);  
-    axpy(in_sbar,-1.0,out,in);      // in_sbar = in - in_s
-
-    ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
-    PromoteFromSubspace(PleftMss_proj,out);
-
-    HermOp(out,Mtmp);
-
-    ProjectToSubspace(Mtmp,PleftProj);      // Msbar s Mss^{-1}
-    PromoteFromSubspace(PleftProj,tmp2);
-
-    axpy(out,-1.0,tmp2,Mtmp);
-    axpy(out,-1.0,out,in_sbar);     // in_sbar - Msbars Mss^{-1} in_s
-  }
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp){
-
-  } 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
-
-  }
-  virtual void M2(Field & in, Field & out){
-
-  }
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
-
-  }
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
-
-  }
-}
-/*
-template<class Field>
-class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-  virtual void   Vout  (Field & in, Field & out,Field & src,Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-*/
 #endif
@@ -183,13 +183,13 @@ public:
 		  << "\tTrue residual " << true_residual
 		  << "\tTarget " << Tolerance << std::endl;

-        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+        std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
+	std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogPerformance << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogPerformance << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogPerformance << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;

@@ -207,7 +207,8 @@ public:

    TrueResidual = sqrt(norm2(p)/ssq);

-    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
+    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
+	      <<" residual "<< TrueResidual<< std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
@@ -144,7 +144,7 @@ public:
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
-	       <<" target resid "<<rsq[s]<<std::endl;
+	       <<" target resid^2 "<<rsq[s]<<std::endl;
      ps[s] = src;
    }
    // r and p for primary
@@ -79,14 +79,16 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public Imp
    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
-    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
-	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
-	     <<std::endl;

    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;

+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" target " << eresid*eresid << " conv " <<conv
+	     <<std::endl;
+
    return conv;
  }
 };
@@ -457,7 +459,7 @@ until convergence
 	    std::vector<Field>& evec,
 	    Field& w,int Nm,int k)
  {
-    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
+    std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
    const RealD tiny = 1.0e-20;
    assert( k< Nm );

@@ -465,7 +467,7 @@ until convergence

    Field& evec_k = evec[k];

-    _PolyOp(evec_k,w);    std::cout<<GridLogIRL << "PolyOp" <<std::endl;
+    _PolyOp(evec_k,w);    std::cout<<GridLogDebug << "PolyOp" <<std::endl;

    if(k>0) w -= lme[k-1] * evec[k-1];

@@ -480,18 +482,18 @@ until convergence
    lme[k] = beta;

    if ( (k>0) && ( (k % orth_period) == 0 )) {
-      std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
+      std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl;
      orthogonalize(w,evec,k); // orthonormalise
-      std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
+      std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl;
    }

    if(k < Nm-1) evec[k+1] = w;

-    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;

-    std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
+    std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl;
  }

  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Take a matrix and form an NE solver calling a Herm solver
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class NormalEquations {
+template<class Field> class NormalEquations : public LinearFunction<Field>{
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
@@ -60,7 +60,7 @@ public:
  }     
 };

-template<class Field> class HPDSolver {
+template<class Field> class HPDSolver : public LinearFunction<Field> {
 private:
  LinearOperatorBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
@@ -78,13 +78,13 @@ public:
  void operator() (const Field &in, Field &out){
 
    _Guess(in,out);
-    _HermitianSolver(_Matrix,in,out);  // Mdag M out = Mdag in
+    _HermitianSolver(_Matrix,in,out);  //M out = in

  }     
 };


-template<class Field> class MdagMSolver {
+template<class Field> class MdagMSolver : public LinearFunction<Field> {
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
    RealD evalMaxApprox = 0.0; 
    auto src_n = src; 
    auto tmp = src; 
-    const int _MAX_ITER_EST_ = 50; 
+    const int _MAX_ITER_EST_ = 100; 

    for (int i=0;i<_MAX_ITER_EST_;i++) { 
      
@@ -0,0 +1,383 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/Aggregates.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+inline RealD AggregatePowerLaw(RealD x)
+{
+  //  Returns x^-5. Earlier revisions of this line used std::pow(x,-4) and
+  //  std::pow(x,-3); callers in this file pass it to Chebyshev as the target function.
+  return std::pow(x,-5);
+}
+
+template<class Fobj,class CComplex,int nbasis>
+class Aggregation {
+public:
+  constexpr int Nbasis(void) { return nbasis; };
+  
+  typedef iVector<CComplex,nbasis >             siteVector;
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+
+  GridBase *CoarseGrid;
+  GridBase *FineGrid;
+  std::vector<Lattice<Fobj> > subspace;
+  int checkerboard;
+  int Checkerboard(void){return checkerboard;}
+  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
+    CoarseGrid(_CoarseGrid),
+    FineGrid(_FineGrid),
+    subspace(nbasis,_FineGrid),
+    checkerboard(_checkerboard)
+  {
+  };
+  
+  
+  void Orthogonalise(void){
+    CoarseScalar InnerProd(CoarseGrid); 
+    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+  } 
+  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
+    blockProject(CoarseVec,FineVec,subspace);
+  }
+  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
+    FineVec.Checkerboard() = subspace[0].Checkerboard();
+    blockPromote(CoarseVec,FineVec,subspace);
+  }
+
+  virtual void CreateSubspaceRandom(GridParallelRNG  &RNG) {
+    // Fill every one of the nbasis subspace vectors with fresh Gaussian noise
+    // drawn from RNG and rescaled by norm2(noise)^(-1/2).
+    FineField noise(FineGrid);
+    for(int b=0;b<nbasis;b++){
+      gaussian(RNG,noise);
+      const RealD rescale = std::pow(norm2(noise),-0.5); 
+      noise=noise*rescale;
+      // The normalised draw is the basis vector; no prior clearing is needed.
+      subspace[b] = noise;
+    }
+  }
+  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
+  {
+
+    RealD scale;
+
+    ConjugateGradient<FineField> CG(1.0e-2,100,false);
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+
+    for(int b=0;b<nn;b++){
+      
+      subspace[b] = Zero();
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+      
+      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      for(int i=0;i<1;i++){
+
+	CG(hermop,noise,subspace[b]);
+
+	noise = subspace[b];
+	scale = std::pow(norm2(noise),-0.5); 
+	noise=noise*scale;
+
+      }
+
+      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
+      subspace[b]   = noise;
+
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
+  // and this is the best I found
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
+    std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
+	      <<ordermin<<" step "<<orderstep
+	      <<" lo"<<filterlo<<std::endl;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {
+      // Filter
+      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      b++;
+    }
+
+    // Generate a full sequence of Chebyshevs
+    {
+      lo=filterlo;
+      noise=Mn;
+
+      FineField T0(FineGrid); T0 = noise;  
+      FineField T1(FineGrid); 
+      FineField T2(FineGrid);
+      FineField y(FineGrid);
+      
+      FineField *Tnm = &T0;
+      FineField *Tn  = &T1;
+      FineField *Tnp = &T2;
+
+      // Tn=T1 = (xscale M + mscale)in
+      RealD xscale = 2.0/(hi-lo);
+      RealD mscale = -(hi+lo)/(hi-lo);
+      hermop.HermOp(T0,y);
+      T1=y*xscale+noise*mscale;
+
+      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
+	
+	hermop.HermOp(*Tn,y);
+
+	autoView( y_v , y, AcceleratorWrite);
+	autoView( Tn_v , (*Tn), AcceleratorWrite);
+	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
+	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
+	const int Nsimd = CComplex::Nsimd();
+	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+        });
+
+	// Possible more fine grained control is needed than a linear sweep,
+	// but huge productivity gain if this is simple algorithm and not a tunable
+	int m =1;
+	if ( n>=ordermin ) m=n-ordermin;
+	if ( (m%orderstep)==0 ) { 
+	  Mn=*Tnp;
+	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
+	  subspace[b] = Mn;
+	  hermop.Op(Mn,tmp); 
+	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  b++;
+	}
+
+	// Cycle pointers to avoid copies
+	FineField *swizzle = Tnm;
+	Tnm    =Tn;
+	Tn     =Tnp;
+	Tnp    =swizzle;
+	  
+      }
+    }
+    assert(b==nn);
+  }
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise  : nbasis "<<nn<<std::endl;
+
+
+    for(int b =0;b<nbasis;b++)
+    {
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      // Initial matrix element
+      hermop.Op(noise,Mn);
+      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      // Filter
+      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+
+      // Refine
+      Chebyshev<FineField> PowerLaw(lo,hi,1000,AggregatePowerLaw);
+      noise = Mn;
+      PowerLaw(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+
+      // normalise
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+    }
+
+  }
+
+  virtual void CreateSubspaceChebyshevPowerLaw(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+					       int nn,
+					       double hi,
+					       int orderfilter
+					       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" [0,"<<hi<<"]"<<std::endl;
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise  : nbasis "<<nn<<std::endl;
+
+    for(int b =0;b<nbasis;b++)
+    {
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      // Initial matrix element
+      hermop.Op(noise,Mn);
+      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+      // Filter
+      Chebyshev<FineField> Cheb(0.0,hi,orderfilter,AggregatePowerLaw);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+    }
+
+  }
+
+  virtual void CreateSubspaceMultishift(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+					double Lo,double tol,int maxit)
+  {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    std::cout << GridLogMessage<<" Multishift subspace : Lo "<<Lo<<std::endl;
+
+    // Filter
+    // with e=Lo/3:  1/6(x+Lo) - 1/2(x+Lo+e) + 1/2(x+Lo+2e) - 1/6(x+Lo+3e) = e^3 /[ (x+Lo)(x+Lo+e)(x+Lo+2e)(x+Lo+3e) ]
+    //
+    // 1/(x+Lo)  - 1/(x+2 Lo)
+    double epsilon      = Lo/3;
+    std::vector<RealD> alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0});
+    std::vector<RealD> shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon});
+    std::vector<RealD> tols({tol,tol,tol,tol});
+    std::cout << "sizes "<<alpha.size()<<" "<<shifts.size()<<" "<<tols.size()<<std::endl;
+
+    MultiShiftFunction msf(4,0.0,95.0);
+    std::cout << "msf constructed "<<std::endl;
+    msf.poles=shifts;
+    msf.residues=alpha;
+    msf.tolerances=tols;
+    msf.norm=0.0;
+    msf.order=alpha.size();
+    ConjugateGradientMultiShift<FineField> MSCG(maxit,msf);
+    
+    for(int b =0;b<nbasis;b++)
+    {
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      // Initial matrix element
+      hermop.Op(noise,Mn);
+      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      MSCG(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+    }
+
+  }
+  virtual void RefineSubspace(LinearOperatorBase<FineField> &hermop,
+			      double Lo,double tol,int maxit)
+  {
+    // For each basis vector: run a sloppy CG (tol, maxit) on hermop with that vector as source and keep the solution.
+    FineField tmp(FineGrid);
+    for(int b =0;b<nbasis;b++){
+      tmp = Zero(); // in/out solution field: give it a defined value (presumably CG reads it as the initial guess)
+      ConjugateGradient<FineField>  CGsloppy(tol,maxit,false);
+      ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,Lo); // NOTE(review): never passed to the solver - confirm
+      CGsloppy(hermop,subspace[b],tmp);
+      subspace[b]=tmp;
+    }
+  }
+
+  
+  
+};
+NAMESPACE_END(Grid);
@@ -56,243 +56,6 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
  blockSum(CoarseInner,fine_inner_msk);
 }

-
-class Geometry {
-public:
-  int npoint;
-  int base;
-  std::vector<int> directions   ;
-  std::vector<int> displacements;
-  std::vector<int> points_dagger;
-
-  Geometry(int _d)  {
-    
-    base = (_d==5) ? 1:0;
-
-    // make coarse grid stencil for 4d , not 5d
-    if ( _d==5 ) _d=4;
-
-    npoint = 2*_d+1;
-    directions.resize(npoint);
-    displacements.resize(npoint);
-    points_dagger.resize(npoint);
-    for(int d=0;d<_d;d++){
-      directions[d   ] = d+base;
-      directions[d+_d] = d+base;
-      displacements[d  ] = +1;
-      displacements[d+_d]= -1;
-      points_dagger[d   ] = d+_d;
-      points_dagger[d+_d] = d;
-    }
-    directions   [2*_d]=0;
-    displacements[2*_d]=0;
-    points_dagger[2*_d]=2*_d;
-  }
-
-  int point(int dir, int disp) {
-    assert(disp == -1 || disp == 0 || disp == 1);
-    assert(base+0 <= dir && dir < base+4);
-
-    // directions faster index = new indexing
-    // 4d (base = 0):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   0  1  2  3  0  1  2  3  0
-    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
-    // 5d (base = 1):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   1  2  3  4  1  2  3  4  0
-    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
-
-    // displacements faster index = old indexing
-    // 4d (base = 0):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   0  0  1  1  2  2  3  3  0
-    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
-    // 5d (base = 1):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   1  1  2  2  3  3  4  4  0
-    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
-
-    if(dir == 0 and disp == 0)
-      return 8;
-    else // New indexing
-      return (1 - disp) / 2 * 4 + dir - base;
-    // else // Old indexing
-    //   return (4 * (dir - base) + 1 - disp) / 2;
-  }
-};
-  
-template<class Fobj,class CComplex,int nbasis>
-class Aggregation   {
-public:
-  typedef iVector<CComplex,nbasis >             siteVector;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-
-  GridBase *CoarseGrid;
-  GridBase *FineGrid;
-  std::vector<Lattice<Fobj> > subspace;
-  int checkerboard;
-  int Checkerboard(void){return checkerboard;}
-  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
-    CoarseGrid(_CoarseGrid),
-    FineGrid(_FineGrid),
-    subspace(nbasis,_FineGrid),
-    checkerboard(_checkerboard)
-  {
-  };
-  
-  void Orthogonalise(void){
-    CoarseScalar InnerProd(CoarseGrid); 
-    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
-    blockOrthogonalise(InnerProd,subspace);
-  } 
-  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
-    blockProject(CoarseVec,FineVec,subspace);
-  }
-  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
-    FineVec.Checkerboard() = subspace[0].Checkerboard();
-    blockPromote(CoarseVec,FineVec,subspace);
-  }
-
-  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
-
-    RealD scale;
-
-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-
-    for(int b=0;b<nn;b++){
-      
-      subspace[b] = Zero();
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-      
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-      for(int i=0;i<1;i++){
-
-	CG(hermop,noise,subspace[b]);
-
-	noise = subspace[b];
-	scale = std::pow(norm2(noise),-0.5); 
-	noise=noise*scale;
-
-      }
-
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
-      subspace[b]   = noise;
-
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
-  // and this is the best I found
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter,
-				       int ordermin,
-				       int orderstep,
-				       double filterlo
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-
-    // New normalised noise
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5); 
-    noise=noise*scale;
-
-    // Initial matrix element
-    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-    int b =0;
-    {
-      // Filter
-      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
-      Cheb(hermop,noise,Mn);
-      // normalise
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-      b++;
-    }
-
-    // Generate a full sequence of Chebyshevs
-    {
-      lo=filterlo;
-      noise=Mn;
-
-      FineField T0(FineGrid); T0 = noise;  
-      FineField T1(FineGrid); 
-      FineField T2(FineGrid);
-      FineField y(FineGrid);
-      
-      FineField *Tnm = &T0;
-      FineField *Tn  = &T1;
-      FineField *Tnp = &T2;
-
-      // Tn=T1 = (xscale M + mscale)in
-      RealD xscale = 2.0/(hi-lo);
-      RealD mscale = -(hi+lo)/(hi-lo);
-      hermop.HermOp(T0,y);
-      T1=y*xscale+noise*mscale;
-
-      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
-	
-	hermop.HermOp(*Tn,y);
-
-	autoView( y_v , y, AcceleratorWrite);
-	autoView( Tn_v , (*Tn), AcceleratorWrite);
-	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
-	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
-	const int Nsimd = CComplex::Nsimd();
-	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
-	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
-	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
-        });
-
-	// Possible more fine grained control is needed than a linear sweep,
-	// but huge productivity gain if this is simple algorithm and not a tunable
-	int m =1;
-	if ( n>=ordermin ) m=n-ordermin;
-	if ( (m%orderstep)==0 ) { 
-	  Mn=*Tnp;
-	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
-	  subspace[b] = Mn;
-	  hermop.Op(Mn,tmp); 
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-	  b++;
-	}
-
-	// Cycle pointers to avoid copies
-	FineField *swizzle = Tnm;
-	Tnm    =Tn;
-	Tn     =Tnp;
-	Tnp    =swizzle;
-	  
-      }
-    }
-    assert(b==nn);
-  }
-
-};
-
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
@@ -0,0 +1,621 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
+
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Fine Object == (per site) type of fine field
+// nbasis      == number of deflation vectors
+template<class Fobj,class CComplex,int nbasis>
+class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+public:
+
+  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
+  typedef iVector<CComplex,nbasis >           siteVector;
+  typedef iMatrix<CComplex,nbasis >           siteMatrix;
+  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+  typedef iMatrix<CComplex,nbasis >  Cobj;
+  typedef iVector<CComplex,nbasis >  Cvec;
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+  typedef Lattice<CComplex >    FineComplexField;
+  typedef CoarseVector Field;
+  ////////////////////
+  // Data members
+  ////////////////////
+  int hermitian;
+  GridBase      *       _FineGrid; 
+  GridCartesian *       _CoarseGrid; 
+  NonLocalStencilGeometry &geom;
+  PaddedCell Cell;
+  GeneralLocalStencil Stencil;
+  
+  std::vector<CoarseMatrix> _A;
+  std::vector<CoarseMatrix> _Adag;
+  std::vector<CoarseVector> MultTemporaries;
+
+  ///////////////////////
+  // Interface
+  ///////////////////////
+  GridBase      * Grid(void)           { return _CoarseGrid; };   // this is all the linalg routines need to know
+  GridBase      * FineGrid(void)       { return _FineGrid; };   // this is all the linalg routines need to know
+  GridCartesian * CoarseGrid(void)     { return _CoarseGrid; };   // this is all the linalg routines need to know
+
+  /*  void ShiftMatrix(RealD shift)
+  {
+    int Nd=_FineGrid->Nd(); 
+    Coordinate zero_shift(Nd,0);
+    for(int p=0;p<geom.npoint;p++){
+      if ( zero_shift==geom.shifts[p] ) {
+	_A[p] = _A[p]+shift;
+	//	_Adag[p] = _Adag[p]+shift;
+      }
+    }    
+  }
+  void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
+  {
+    int nfound=0;
+    std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
+    for(int p=0;p<geom.npoint;p++){
+      for(int pp=0;pp<CopyMe.geom.npoint;pp++){
+ 	// Search for the same relative shift
+	// Avoids brutal handling of Grid pointers
+	if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
+	  _A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
+	  //	  _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
+	  nfound++;
+	}
+      }
+    }
+    assert(nfound==geom.npoint);
+    ExchangeCoarseLinks();
+  }
+  */
+  
+  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
+    : hermitian(1),
+      _FineGrid(FineGrid),
+      _CoarseGrid(CoarseGrid),
+      geom(_geom),
+      Cell(_geom.Depth(),_CoarseGrid),
+      Stencil(Cell.grids.back(),geom.shifts)
+  {
+    // The initialiser list is written in member declaration order, which is the
+    // order C++ actually runs it in: Cell reads _CoarseGrid, Stencil reads Cell and geom.
+    // Allocate one coarse link matrix per stencil point; _Adag is left unsized.
+    _A.resize(geom.npoint,CoarseGrid);
+    //    _Adag.resize(geom.npoint,CoarseGrid);
+  }
+  void M (const CoarseVector &in, CoarseVector &out)
+  {
+    Mult(_A,in,out);
+  }
+  void Mdag (const CoarseVector &in, CoarseVector &out)
+  {
+    assert(hermitian);
+    Mult(_A,in,out);
+    //    Only the hermitian case is handled: the adjoint is applied with the same links as M.
+    //    A non-hermitian path would use Mult(_Adag,in,out); the constructor does not size _Adag.
+  }
+  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
+  {
+    RealD tviews=0;    RealD ttot=0;    RealD tmult=0;   RealD texch=0;    RealD text=0; RealD ttemps=0; RealD tcopy=0;
+    RealD tmult2=0;
+
+    ttot=-usecond();
+    conformable(CoarseGrid(),in.Grid());
+    conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();
+    CoarseVector tin=in;
+
+    texch-=usecond();
+    CoarseVector pin = Cell.ExchangePeriodic(tin);
+    texch+=usecond();
+
+    CoarseVector pout(pin.Grid());
+
+    int npoint = geom.npoint;
+    typedef LatticeView<Cobj> Aview;
+    typedef LatticeView<Cvec> Vview;
+      
+    const int Nsimd = CComplex::Nsimd();
+    
+    int64_t osites=pin.Grid()->oSites();
+
+    RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
+    RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+                + 2.0*osites*sizeof(siteVector)*npoint;
+      
+    {
+      tviews-=usecond();
+      autoView( in_v , pin, AcceleratorRead);
+      autoView( out_v , pout, AcceleratorWriteDiscard);
+      autoView( Stencil_v  , Stencil, AcceleratorRead);
+      tviews+=usecond();
+
+      // Static and prereserve to keep UVM region live and not resized across multiple calls
+      ttemps-=usecond();
+      MultTemporaries.resize(npoint,pin.Grid());       
+      ttemps+=usecond();
+      std::vector<Aview> AcceleratorViewContainer_h;
+      std::vector<Vview> AcceleratorVecViewContainer_h; 
+
+      tviews-=usecond();
+      for(int p=0;p<npoint;p++) {
+	AcceleratorViewContainer_h.push_back(      A[p].View(AcceleratorRead));
+	AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
+      }
+      tviews+=usecond();
+
+      static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
+      static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); 
+      
+      auto Aview_p = &AcceleratorViewContainer[0];
+      auto Vview_p = &AcceleratorVecViewContainer[0];
+      tcopy-=usecond();
+      acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
+      acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
+      tcopy+=usecond();
+
+      tmult-=usecond();
+      accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
+	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+	  int32_t ss   = spb/(nbasis*npoint);
+	  int32_t bp   = spb%(nbasis*npoint);
+	  int32_t point= bp/nbasis;
+	  int32_t b    = bp%nbasis;
+	  auto SE  = Stencil_v.GetEntry(point,ss);
+	  auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
+	  auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
+	  for(int bb=1;bb<nbasis;bb++) {
+	    res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
+	  }
+	  coalescedWrite(Vview_p[point][ss](b),res);
+      });
+      tmult2-=usecond();
+      accelerator_for(sb, osites*nbasis, Nsimd, {
+	  int ss = sb/nbasis;
+	  int b  = sb%nbasis;
+	  auto res = coalescedRead(Vview_p[0][ss](b));
+	  for(int point=1;point<npoint;point++){
+	    res = res + coalescedRead(Vview_p[point][ss](b));
+	  }
+	  coalescedWrite(out_v[ss](b),res);
+      });
+      tmult2+=usecond();
+      tmult+=usecond();
+      for(int p=0;p<npoint;p++) {
+	AcceleratorViewContainer_h[p].ViewClose();
+	AcceleratorVecViewContainer_h[p].ViewClose();
+      }
+    }
+
+    text-=usecond();
+    out = Cell.Extract(pout);
+    text+=usecond();
+    ttot+=usecond();
+    
+    std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<" of which mult2  "<<tmult2<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult copy  "<<tcopy<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
+    //    std::cout << GridLogPerformance<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+
+  };
+  
+  void PopulateAdag(void)
+  {
+    // For every global coarse site x and stencil point p: _Adag[Reverse(p)](x) = adj( _A[p](x - shifts[p]) ), periodic wrap.
+    if ( _Adag.size() != _A.size() ) _Adag.resize(_A.size(),CoarseGrid()); // the constructor leaves _Adag unsized
+    for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
+      Coordinate bcoor;
+      CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
+      for(int p=0;p<geom.npoint;p++){
+	Coordinate scoor = bcoor;
+	for(int mu=0;mu<bcoor.size();mu++){
+	  int L = CoarseGrid()->GlobalDimensions()[mu];
+	  scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
+	}
+	auto link = peekSite(_A[p],scoor); // Flip to poke/peekLocalSite and not too bad
+	int pp = geom.Reverse(p);
+	pokeSite(adj(link),_Adag[pp],bcoor);
+      }
+    }
+  }
+  /////////////////////////////////////////////////////////////
+  // 
+  // A) Only reduced flops option is to use a padded cell of depth 4
+  // and apply MpcDagMpc in the padded cell.
+  //
+  // Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
+  // With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
+  // Cost is 81x more, same as stencil size.
+  //
+  // But: can eliminate comms and do as local dirichlet.
+  //
+  // Local exchange gauge field once.
+  // Apply to all vectors, local only computation.
+  // Must exchange ghost subcells in reverse process of PaddedCell to take inner products
+  //
+  // B) Can reduce cost: pad by 1, apply Deo      (4^4+6^4+8^4+8^4 )/ (4x 4^4)
+  //                     pad by 2, apply Doe
+  //                     pad by 3, apply Deo
+  //                     then break out 8x directions; cost is ~10x MpcDagMpc per vector
+  //
+  // => almost factor of 10 in setup cost, excluding data rearrangement
+  //
+  // Intermediates -- ignore the corner terms, leave approximate and force Hermitian
+  // Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
+  /////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////
+    // BFM HDCG style approach: Solve a system of equations to get Aij
+    //////////////////////////////////////////////////////////
+    /*
+     *     Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
+     *
+     *     conj(phases[block]) proj[k][ block*Nvec+j ] =  \sum_ball  e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} > 
+     *                                                 =  \sum_ball e^{iqk.delta} A_ji
+     *
+     *     Must invert matrix M_k,l = e^[i q_k . delta_l]
+     *
+     *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
+     */
+#if 0
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
+    GridBase *grid = FineGrid();
+    // NOTE: this variant sits in the #if 0 branch (not compiled). It fills _A[k] like the active variant, but rebuilds the block phase for every (basis vector, momentum) pair.
+    RealD tproj=0.0;  // the t* variables accumulate usecond() intervals per stage and are printed at the end
+    RealD teigen=0.0;
+    RealD tmat=0.0;
+    RealD tphase=0.0;
+    RealD tinv=0.0;
+
+    /////////////////////////////////////////////////////////////
+    // Orthogonalise the subblocks over the basis
+    /////////////////////////////////////////////////////////////
+    CoarseScalar InnerProd(CoarseGrid()); 
+    blockOrthogonalise(InnerProd,Subspace.subspace);
+
+    const int npoint = geom.npoint;
+      
+    Coordinate clatt = CoarseGrid()->GlobalDimensions();
+    int Nd = CoarseGrid()->Nd();
+
+      /*
+       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
+       *     Matrix index i is mapped to this shift via 
+       *               geom.shifts[i]
+       *
+       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
+       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
+       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
+       *       = M_{kl} A_ji^{b.b+l}
+       *
+       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
+       *  
+       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
+       *
+       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
+       */
+    teigen-=usecond();
+    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
+    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
+    ComplexD ci(0.0,1.0);
+    for(int k=0;k<npoint;k++){ // Loop over momenta
+
+      for(int l=0;l<npoint;l++){ // Loop over nbr relative
+	ComplexD phase(0.0,0.0);
+	for(int mu=0;mu<Nd;mu++){
+	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	  phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
+	}
+	phase=exp(phase*ci);
+	Mkl(k,l) = phase; // = exp( i * sum_mu (2 pi / clatt[mu]) * shifts[k][mu] * shifts[l][mu] )
+      }
+    }
+    invMkl = Mkl.inverse(); // dense npoint x npoint inverse, used below to separate the stencil points
+    teigen+=usecond();
+
+    ///////////////////////////////////////////////////////////////////////
+    // Now compute the matrix elements of linop between the orthonormal
+    // set of vectors.
+    ///////////////////////////////////////////////////////////////////////
+    FineField phaV(grid); // Phased block basis vector
+    FineField MphaV(grid);// Matrix applied
+    CoarseVector coarseInner(CoarseGrid());
+
+    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
+    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
+    for(int i=0;i<nbasis;i++){// Loop over basis vectors
+      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
+      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+	/////////////////////////////////////////////////////
+	// Stick a phase on every block
+	/////////////////////////////////////////////////////
+	tphase-=usecond();
+	CoarseComplexField coor(CoarseGrid());
+	CoarseComplexField pha(CoarseGrid());	pha=Zero();
+	for(int mu=0;mu<Nd;mu++){
+	  LatticeCoordinate(coor,mu);
+	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	  pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
+	}
+	pha  =exp(pha*ci);
+	phaV=Zero();
+	blockZAXPY(phaV,pha,Subspace.subspace[i],phaV); // phaV was zeroed above, so this presumably yields pha (per block) times basis vector i
+	tphase+=usecond();
+
+	/////////////////////////////////////////////////////////////////////
+	// Multiply phased subspace vector by matrix and project to subspace
+	// Remove local bulk phase to leave relative phases
+	/////////////////////////////////////////////////////////////////////
+	tmat-=usecond();
+	linop.Op(phaV,MphaV);
+	tmat+=usecond();
+
+	tproj-=usecond();
+	blockProject(coarseInner,MphaV,Subspace.subspace);
+	coarseInner = conjugate(pha) * coarseInner;
+
+	ComputeProj[p] = coarseInner;
+	tproj+=usecond();
+
+      }
+
+      tinv-=usecond();
+      for(int k=0;k<npoint;k++){
+	FT[k] = Zero();
+	for(int l=0;l<npoint;l++){
+	  FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l]; // note the index order: element (l,k) of the inverse
+	}
+      
+	int osites=CoarseGrid()->oSites();
+	autoView( A_v  , _A[k], AcceleratorWrite);
+	autoView( FT_v  , FT[k], AcceleratorRead);
+	accelerator_for(sss, osites, 1, {
+	    for(int j=0;j<nbasis;j++){
+	      A_v[sss](i,j) = FT_v[sss](j); // row i (current basis vector) of the link matrix for stencil point k
+	    }
+        });
+      }
+      tinv+=usecond();
+    }
+
+    // Only needed if nonhermitian
+    if ( ! hermitian ) {
+      //      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
+      //      PopulateAdag();
+    }
+
+    // Need to write something to populate Adag from A
+    ExchangeCoarseLinks(); // every _A[p] is replaced by its Cell.ExchangePeriodic() image
+    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator mat    "<<tmat <<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator proj   "<<tproj<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
+  }
+#else
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
+    GridBase *grid = FineGrid();
+    // Fills the link matrices _A[k], one per stencil point: each basis vector is multiplied by npoint block phases, passed through linop, projected, then unmixed with invMkl.
+    RealD tproj=0.0;  // the t* variables accumulate usecond() intervals per stage and are printed at the end
+    RealD teigen=0.0;
+    RealD tmat=0.0;
+    RealD tphase=0.0;
+    RealD tphaseBZ=0.0;
+    RealD tinv=0.0;
+
+    /////////////////////////////////////////////////////////////
+    // Orthogonalise the subblocks over the basis
+    /////////////////////////////////////////////////////////////
+    CoarseScalar InnerProd(CoarseGrid()); 
+    blockOrthogonalise(InnerProd,Subspace.subspace);
+
+    for(int s=0;s<Subspace.subspace.size();s++){ // diagnostic output: norm of every basis vector after orthogonalisation
+      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
+    }
+    const int npoint = geom.npoint;
+      
+    Coordinate clatt = CoarseGrid()->GlobalDimensions();
+    int Nd = CoarseGrid()->Nd();
+
+      /*
+       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
+       *     Matrix index i is mapped to this shift via 
+       *               geom.shifts[i]
+       *
+       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
+       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
+       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
+       *       = M_{kl} A_ji^{b.b+l}
+       *
+       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
+       *  
+       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
+       *
+       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
+       */
+    teigen-=usecond();
+    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
+    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
+    ComplexD ci(0.0,1.0);
+    for(int k=0;k<npoint;k++){ // Loop over momenta
+
+      for(int l=0;l<npoint;l++){ // Loop over nbr relative
+	ComplexD phase(0.0,0.0);
+	for(int mu=0;mu<Nd;mu++){
+	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	  phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
+	}
+	phase=exp(phase*ci);
+	Mkl(k,l) = phase; // = exp( i * sum_mu (2 pi / clatt[mu]) * shifts[k][mu] * shifts[l][mu] )
+	std::cout<<" Mkl "<<k<<" "<<l<<" "<<phase<<std::endl; // diagnostic dump of every phase-matrix entry
+      }
+    }
+    invMkl = Mkl.inverse(); // dense npoint x npoint inverse, used below to separate the stencil points
+    teigen+=usecond();
+
+    ///////////////////////////////////////////////////////////////////////
+    // Now compute the matrix elements of linop between the orthonormal
+    // set of vectors.
+    ///////////////////////////////////////////////////////////////////////
+    FineField phaV(grid); // Phased block basis vector
+    FineField MphaV(grid);// Matrix applied
+    std::vector<FineComplexField> phaF(npoint,grid);          // per-momentum phase as a fine-grid field
+    std::vector<CoarseComplexField> pha(npoint,CoarseGrid()); // per-momentum phase, one value per coarse site
+    
+    CoarseVector coarseInner(CoarseGrid());
+    
+    typedef typename CComplex::scalar_type SComplex;
+    FineComplexField one(grid); one=SComplex(1.0);
+    FineComplexField zz(grid); zz = Zero();
+    tphase=-usecond(); // plain assignment rather than -=; equivalent here because tphase is still 0.0
+    for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+      /////////////////////////////////////////////////////
+      // Stick a phase on every block
+      /////////////////////////////////////////////////////
+      CoarseComplexField coor(CoarseGrid());
+      pha[p]=Zero();
+      for(int mu=0;mu<Nd;mu++){
+	LatticeCoordinate(coor,mu);
+	RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
+      }
+      pha[p]  =exp(pha[p]*ci);
+
+      blockZAXPY(phaF[p],pha[p],one,zz); // presumably phaF[p] = pha[p]*one + zz, i.e. the block phase spread over the fine grid; computed once, outside the basis loop
+      
+    }
+    tphase+=usecond();
+    
+    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
+    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
+    for(int i=0;i<nbasis;i++){// Loop over basis vectors
+      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
+      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+	tphaseBZ-=usecond();
+	phaV = phaF[p]*Subspace.subspace[i]; // basis vector i multiplied by the precomputed fine-grid phase
+	tphaseBZ+=usecond();
+
+	/////////////////////////////////////////////////////////////////////
+	// Multiply phased subspace vector by matrix and project to subspace
+	// Remove local bulk phase to leave relative phases
+	/////////////////////////////////////////////////////////////////////
+	tmat-=usecond();
+	linop.Op(phaV,MphaV);
+	tmat+=usecond();
+	std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl; // diagnostic output
+
+	tproj-=usecond();
+	blockProject(coarseInner,MphaV,Subspace.subspace);
+	coarseInner = conjugate(pha[p]) * coarseInner;
+
+	ComputeProj[p] = coarseInner;
+	tproj+=usecond();
+	std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl; // diagnostic output
+
+      }
+
+      tinv-=usecond();
+      for(int k=0;k<npoint;k++){
+	FT[k] = Zero();
+	for(int l=0;l<npoint;l++){
+	  FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l]; // note the index order: element (l,k) of the inverse
+	  std::cout << i << " " <<k <<" "<<l<< " FT "<<norm2(FT[k])<<" "<<invMkl(l,k)<<std::endl; // diagnostic output; norm2 is evaluated for every (k,l)
+	}
+      
+	int osites=CoarseGrid()->oSites();
+	autoView( A_v  , _A[k], AcceleratorWrite);
+	autoView( FT_v  , FT[k], AcceleratorRead);
+	accelerator_for(sss, osites, 1, {
+	    for(int j=0;j<nbasis;j++){
+	      A_v[sss](i,j) = FT_v[sss](j); // row i (current basis vector) of the link matrix for stencil point k
+	    }
+        });
+      }
+      tinv+=usecond();
+    }
+
+    // Only needed if nonhermitian
+    if ( ! hermitian ) {
+      //      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
+      //      PopulateAdag();
+    }
+
+    for(int p=0;p<geom.npoint;p++){ // diagnostic output: norm of every link field before the exchange
+      std::cout << " _A["<<p<<"] "<<norm2(_A[p])<<std::endl;
+    }
+
+    // Need to write something to populate Adag from A
+    ExchangeCoarseLinks(); // every _A[p] is replaced by its Cell.ExchangePeriodic() image
+    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator mat    "<<tmat <<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator proj   "<<tproj<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
+  }
+#endif  
+  void ExchangeCoarseLinks(void){ // overwrite each of the geom.npoint link fields with Cell.ExchangePeriodic() of itself
+    for(int point=0; point<geom.npoint; ++point){
+      auto &link = _A[point];
+      link = Cell.ExchangePeriodic(link); // the analogous exchange of _Adag stays disabled
+    }
+  }
+  virtual void Mdiag(const Field &in, Field &out) { assert(0); } // not implemented for this operator: trips assert(0)
+  virtual void Mdir(const Field &in, Field &out, int dir, int disp) { assert(0); } // not implemented for this operator: trips assert(0)
+  virtual void MdirAll(const Field &in, std::vector<Field> &out) { assert(0); } // not implemented for this operator: trips assert(0)
+};
+
+
+  
+NAMESPACE_END(Grid);
@@ -0,0 +1,519 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+
+NAMESPACE_BEGIN(Grid);
+
+
+// Fine Object == (per site) type of fine field
+// nbasis      == number of deflation vectors
+template<class Fobj,class CComplex,int nbasis>
+class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+public:
+  typedef typename CComplex::scalar_object SComplex;
+  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
+  typedef MultiGeneralCoarsenedMatrix<Fobj,CComplex,nbasis> MultiGeneralCoarseOp;
+
+  typedef iVector<CComplex,nbasis >           siteVector;
+  typedef iMatrix<CComplex,nbasis >           siteMatrix;
+  typedef iVector<SComplex,nbasis >           calcVector;
+  typedef iMatrix<SComplex,nbasis >           calcMatrix;
+  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+  typedef iMatrix<CComplex,nbasis >  Cobj;
+  typedef iVector<CComplex,nbasis >  Cvec;
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+  typedef Lattice<CComplex >    FineComplexField;
+  typedef CoarseVector Field;
+
+  ////////////////////
+  // Data members
+  ////////////////////
+  GridCartesian *       _CoarseGridMulti; 
+  NonLocalStencilGeometry geom;
+  NonLocalStencilGeometry geom_srhs;
+  PaddedCell Cell;
+  GeneralLocalStencil Stencil;
+
+  deviceVector<calcVector> BLAS_B;
+  deviceVector<calcVector> BLAS_C;
+  std::vector<deviceVector<calcMatrix> > BLAS_A;
+
+  std::vector<deviceVector<ComplexD *> > BLAS_AP;
+  std::vector<deviceVector<ComplexD *> > BLAS_BP;
+  deviceVector<ComplexD *>               BLAS_CP;
+
+  ///////////////////////
+  // Interface
+  ///////////////////////
+  GridBase *Grid(void) { return _CoarseGridMulti; } // the multi-RHS coarse grid as a GridBase: all the linalg routines need to know
+  GridCartesian *CoarseGrid(void) { return _CoarseGridMulti; } // the same grid pointer, with its GridCartesian type
+
+  // Can be used to do I/O on the operator matrices externally
+  void SetMatrix (int p,CoarseMatrix & A) // copy A, the link-matrix field for stencil point p, into BLAS_A[p]
+  {
+    assert(p>=0 && p<geom_srhs.npoint);   // A is one lattice field (not an array of them): the point index is what needs checking
+    GridtoBLAS(A,BLAS_A[p]);              // NOTE(review): assumes A lives on the same single-RHS grid CoarsenOperator uses -- confirm
+  }
+  void GetMatrix (int p,CoarseMatrix & A) // copy the stored link matrices for stencil point p out of BLAS_A[p] into A
+  {
+    assert(p>=0 && p<geom_srhs.npoint);   // A is one lattice field (not an array of them): the point index is what needs checking
+    BLAStoGrid(A,BLAS_A[p]);              // BLAStoGrid itself asserts that BLAS_A[p].size() equals the local site count of A's grid
+  }
+  /*
+  void CopyMatrix (GeneralCoarseOp &_Op)
+  {
+    for(int p=0;p<geom.npoint;p++){
+      auto Aup = _Op.Cell.Extract(_Op._A[p]);
+      //Unpadded
+      GridtoBLAS(Aup,BLAS_A[p]);
+    }
+  }
+  void CheckMatrix (GeneralCoarseOp &_Op)
+  {
+    std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
+    for(int p=0;p<geom.npoint;p++){
+      //Unpadded
+      auto Aup = _Op.Cell.Extract(_Op._A[p]);
+      auto Ack = Aup;
+      BLAStoGrid(Ack,BLAS_A[p]);
+      std::cout << p<<" Ack "<<norm2(Ack)<<std::endl;
+      std::cout << p<<" Aup "<<norm2(Aup)<<std::endl;
+    }
+    std::cout <<"************* "<<std::endl;
+  }
+  */
+  
+  MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridCartesian *CoarseGridMulti) :
+    // Initialisers are written in member declaration order, which is the order C++ executes them in:
+    // Cell needs geom.Depth(), and Stencil needs both Cell and geom.shifts.
+    _CoarseGridMulti(CoarseGridMulti),
+    geom(_CoarseGridMulti,_geom.hops,_geom.skip+1),
+    geom_srhs(_geom),
+    Cell(geom.Depth(),_CoarseGridMulti),
+    Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
+  {
+    // Dimension 0 of the multi-RHS grid counts right hand sides; dividing it out gives per-RHS site counts.
+    int32_t nrhs  = CoarseGridMulti->FullDimensions()[0];  // # RHS
+    int32_t orhs  = nrhs/CComplex::Nsimd();                // outer-site stride per spatial site, used to index Stencil._entries below
+    int32_t padded_sites   = Cell.grids.back()->lSites()/nrhs;
+    int32_t unpadded_sites = CoarseGridMulti->lSites()/nrhs;
+    
+    /////////////////////////////////////////////////
+    // Device data vector storage
+    /////////////////////////////////////////////////
+    BLAS_A.resize(geom.npoint);
+    for(int p=0;p<geom.npoint;p++){
+      BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
+    }
+    
+    BLAS_B.resize(nrhs *padded_sites);   // includes ghost zone
+    BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
+    BLAS_AP.resize(geom.npoint);
+    BLAS_BP.resize(geom.npoint);
+    for(int p=0;p<geom.npoint;p++){
+      BLAS_AP[p].resize(unpadded_sites);
+      BLAS_BP[p].resize(unpadded_sites);
+    }
+    BLAS_CP.resize(unpadded_sites);
+
+    /////////////////////////////////////////////////
+    // Pointers to data
+    /////////////////////////////////////////////////
+    // The tables below hold one raw pointer per unpadded site; they are the batch arguments gemmBatched receives in M().
+    // They point into BLAS_A / BLAS_B / BLAS_C, so they stay valid only while those vectors are not reallocated.
+    // Site identity mapping for A
+    for(int p=0;p<geom.npoint;p++){
+      for(int ss=0;ss<unpadded_sites;ss++){
+	ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
+	acceleratorPut(BLAS_AP[p][ss],ptr);
+      }
+    }
+    // Site identity mapping for C
+    for(int ss=0;ss<unpadded_sites;ss++){
+      ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs]; // nrhs consecutive entries per site
+      acceleratorPut(BLAS_CP[ss],ptr);
+    }
+
+    // Neighbour table is more complicated
+    int32_t j=0; // Interior point counter (unpadded)
+    for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
+      int ghost_zone=0;
+      for(int32_t point = 0 ; point < geom.npoint; point++){
+	int i=s*orhs*geom.npoint+point;
+	if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
+	  ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
+	}
+      }
+
+      if( ghost_zone==0) { // only sites none of whose stencil legs wrap get a row in the tables
+	for(int32_t point = 0 ; point < geom.npoint; point++){
+	  int i=s*orhs*geom.npoint+point;
+ 	  int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
+	  assert(nbr<BLAS_B.size());
+	  ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
+	  acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
+	}
+	j++;
+      }
+    }
+    assert(j==unpadded_sites); // the non-wrapping padded sites must be exactly the unpadded volume
+  }
+  template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
+  {
+    // Unpack a SIMD-vectorised lattice into one scalar object per local site. Output element idx is the
+    // site whose local coordinate is the lexicographic decoding of idx over LocalDimensions().
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;
+
+    GridBase *src_grid = from.Grid();
+    assert(!src_grid->_isCheckerBoarded);
+    int ndim = src_grid->_ndimension;
+
+    to.resize(src_grid->lSites());
+
+    // Number of local sites = product of the local extents
+    Coordinate local_dims = src_grid->LocalDimensions();
+    size_t nsite = 1;
+    for(int d=0;d<ndim;d++){
+      nsite *= local_dims[d];
+    }
+
+    // Layout tables are copied into locals for use inside the kernel below
+    Coordinate src_ostride     = src_grid->_ostride;
+    Coordinate src_istride     = src_grid->_istride;
+    Coordinate src_rdimensions = src_grid->_rdimensions;
+
+    autoView(src_view,from,AcceleratorRead);
+    auto dst_base = &to[0];
+
+    // Each vobj is copied as `words` consecutive vector_type words
+    const int words=sizeof(vobj)/sizeof(vector_type);
+    accelerator_for(idx,nsite,1,{
+        // site index -> local coordinate -> (outer site, SIMD lane)
+        Coordinate site_coor;
+        Lexicographic::CoorFromIndex(site_coor,idx,local_dims);
+        int src_osite = 0;
+        int src_lane  = 0;
+        for(int d=0;d<ndim;d++){
+          src_osite += src_ostride[d]*(site_coor[d]%src_rdimensions[d]);
+          src_lane  += src_istride[d]*(site_coor[d]/src_rdimensions[d]);
+        }
+        const vector_type *src_words = (const vector_type *)&src_view[src_osite];
+        scalar_type       *dst_words = (scalar_type *)&dst_base[idx];
+        for(int w=0;w<words;w++){
+          dst_words[w] = getlane(src_words[w], src_lane);
+        }
+      });
+  }
+  template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
+  {
+    // Pack one scalar object per local site back into a SIMD-vectorised lattice. Input element idx is the
+    // site whose local coordinate is the lexicographic decoding of idx over LocalDimensions().
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;
+
+    GridBase *dst_grid = grid.Grid();
+    assert(!dst_grid->_isCheckerBoarded);
+    int ndim = dst_grid->_ndimension;
+
+    assert(in.size()==dst_grid->lSites());
+
+    // Number of local sites = product of the local extents
+    Coordinate local_dims = dst_grid->LocalDimensions();
+    size_t nsite = 1;
+    for(int d=0;d<ndim;d++){
+      nsite *= local_dims[d];
+    }
+
+    // Layout tables are copied into locals for use inside the kernel below
+    Coordinate dst_ostride     = dst_grid->_ostride;
+    Coordinate dst_istride     = dst_grid->_istride;
+    Coordinate dst_rdimensions = dst_grid->_rdimensions;
+
+    autoView(dst_view,grid,AcceleratorWrite);
+    auto src_base = &in[0];
+
+    // Each vobj is written as `words` consecutive vector_type words
+    const int words=sizeof(vobj)/sizeof(vector_type);
+    accelerator_for(idx,nsite,1,{
+        // site index -> local coordinate -> (outer site, SIMD lane)
+        Coordinate site_coor;
+        Lexicographic::CoorFromIndex(site_coor,idx,local_dims);
+        int dst_osite = 0;
+        int dst_lane  = 0;
+        for(int d=0;d<ndim;d++){
+          dst_osite += dst_ostride[d]*(site_coor[d]%dst_rdimensions[d]);
+          dst_lane  += dst_istride[d]*(site_coor[d]/dst_rdimensions[d]);
+        }
+        vector_type *dst_words = (vector_type *)&dst_view[dst_osite];
+        scalar_type *src_words = (scalar_type *)&src_base[idx];
+        for(int w=0;w<words;w++){
+          putlane(dst_words[w], src_words[w], dst_lane);
+        }
+      });
+  }
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace,
+		       GridBase *CoarseGrid)
+  {
+    std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
+    // Computes the link matrices on the grid passed as CoarseGrid (a local _A, one field per geom_srhs stencil point) and copies each into BLAS_A[p].
+    GridBase *grid = Subspace.FineGrid;
+    // NOTE: inside this function the CoarseGrid parameter hides the member function CoarseGrid().
+    /////////////////////////////////////////////////////////////
+    // Orthogonalise the subblocks over the basis
+    /////////////////////////////////////////////////////////////
+    CoarseScalar InnerProd(CoarseGrid); 
+    blockOrthogonalise(InnerProd,Subspace.subspace);
+
+    const int npoint = geom_srhs.npoint;
+
+    Coordinate clatt = CoarseGrid->GlobalDimensions();
+    int Nd = CoarseGrid->Nd();
+      /*
+       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
+       *     Matrix index i is mapped to this shift via 
+       *               geom.shifts[i]
+       *
+       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
+       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
+       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
+       *       = M_{kl} A_ji^{b.b+l}
+       *
+       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
+       *  
+       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
+       *
+       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
+       */
+    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
+    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
+    ComplexD ci(0.0,1.0);
+    for(int k=0;k<npoint;k++){ // Loop over momenta
+
+      for(int l=0;l<npoint;l++){ // Loop over nbr relative
+	ComplexD phase(0.0,0.0);
+	for(int mu=0;mu<Nd;mu++){
+	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	  phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
+	}
+	phase=exp(phase*ci);
+	Mkl(k,l) = phase; // = exp( i * sum_mu (2 pi / clatt[mu]) * shifts[k][mu] * shifts[l][mu] )
+      }
+    }
+    invMkl = Mkl.inverse(); // dense npoint x npoint inverse, used below to separate the stencil points
+
+    ///////////////////////////////////////////////////////////////////////
+    // Now compute the matrix elements of linop between the orthonormal
+    // set of vectors.
+    ///////////////////////////////////////////////////////////////////////
+    FineField phaV(grid); // Phased block basis vector
+    FineField MphaV(grid);// Matrix applied
+    std::vector<FineComplexField> phaF(npoint,grid);        // per-momentum phase as a fine-grid field
+    std::vector<CoarseComplexField> pha(npoint,CoarseGrid); // per-momentum phase, one value per coarse site
+    
+    CoarseVector coarseInner(CoarseGrid);
+    
+    typedef typename CComplex::scalar_type SComplex; // local typedef: hides the class-level SComplex (scalar_object) within this function
+    FineComplexField one(grid); one=SComplex(1.0);
+    FineComplexField zz(grid); zz = Zero();
+    for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+      /////////////////////////////////////////////////////
+      // Stick a phase on every block
+      /////////////////////////////////////////////////////
+      CoarseComplexField coor(CoarseGrid);
+      pha[p]=Zero();
+      for(int mu=0;mu<Nd;mu++){
+	LatticeCoordinate(coor,mu);
+	RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
+      }
+      pha[p]  =exp(pha[p]*ci);	
+
+      blockZAXPY(phaF[p],pha[p],one,zz); // presumably phaF[p] = pha[p]*one + zz, i.e. the block phase spread over the fine grid
+    }
+
+    // Could save on storage here
+    std::vector<CoarseMatrix> _A; // function-local scratch: the persistent copy is BLAS_A, filled at the end
+    _A.resize(geom_srhs.npoint,CoarseGrid);
+
+    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
+    CoarseVector          FT(CoarseGrid);
+    for(int i=0;i<nbasis;i++){// Loop over basis vectors
+      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
+      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+
+	phaV = phaF[p]*Subspace.subspace[i]; // basis vector i multiplied by the precomputed fine-grid phase
+
+	/////////////////////////////////////////////////////////////////////
+	// Multiply phased subspace vector by matrix and project to subspace
+	// Remove local bulk phase to leave relative phases
+	/////////////////////////////////////////////////////////////////////
+	linop.Op(phaV,MphaV);
+
+	// Fixme, could use batched block projector here
+	blockProject(coarseInner,MphaV,Subspace.subspace);
+
+	coarseInner = conjugate(pha[p]) * coarseInner;
+
+	ComputeProj[p] = coarseInner;
+      }
+
+      for(int k=0;k<npoint;k++){
+	FT = Zero();
+	for(int l=0;l<npoint;l++){
+	  FT= FT+ invMkl(l,k)*ComputeProj[l]; // note the index order: element (l,k) of the inverse
+	}
+      
+	int osites=CoarseGrid->oSites();
+	autoView( A_v  , _A[k], AcceleratorWrite);
+	autoView( FT_v  , FT, AcceleratorRead);
+	accelerator_for(sss, osites, 1, {
+	    for(int j=0;j<nbasis;j++){
+	      A_v[sss](i,j) = FT_v[sss](j); // row i (current basis vector) of the link matrix for stencil point k
+	    }
+        });
+      }
+    }
+
+    // Only needed if nonhermitian
+    //    if ( ! hermitian ) {
+    //      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
+    //      PopulateAdag();
+    //    }
+    // Need to write something to populate Adag from A
+    
+    for(int p=0;p<geom_srhs.npoint;p++){
+      GridtoBLAS(_A[p],BLAS_A[p]); // flatten link p into the device array that the BLAS_AP[p] pointer table refers to
+    }
+    /*
+Grid : Message : 11698.730546 s : CoarsenOperator eigen  1334 us
+Grid : Message : 11698.730563 s : CoarsenOperator phase  34729 us
+Grid : Message : 11698.730565 s : CoarsenOperator phaseBZ 2423814 us
+Grid : Message : 11698.730566 s : CoarsenOperator mat    127890998 us
+Grid : Message : 11698.730567 s : CoarsenOperator proj   515840840 us
+Grid : Message : 11698.730568 s : CoarsenOperator inv    103948313 us
+Takes 600s to compute matrix elements, DOMINATED by the block project.
+Easy to speed up with the batched block project.
+Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
+     */
+  }
+  // Adjoint application. No separate adjoint matrix is stored: the call is forwarded unchanged to M().
+  // NOTE(review): that is a true adjoint only if the coarse operator is Hermitian -- confirm with callers.
+  void Mdag(const CoarseVector &in, CoarseVector &out)
+  { this->M(in,out); }
+  void M (const CoarseVector &in, CoarseVector &out)
+  {
+    // Multi-RHS coarse operator application:
+    //   1) halo-exchange the input into the padded cell,
+    //   2) flatten the padded field into BLAS_B,
+    //   3) one batched GEMM per stencil point, accumulating into BLAS_C,
+    //   4) copy BLAS_C back into the lattice layout of out.
+    // BLAS_B and BLAS_C are member buffers that every call reuses.
+    //    std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
+    conformable(CoarseGrid(),in.Grid());
+    conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();
+
+    // Timers (microseconds) and the flop/byte estimates below only feed the commented-out report at the end.
+    RealD t_tot;
+    RealD t_exch;
+    RealD t_GtoB;
+    RealD t_BtoG;
+    RealD t_mult;
+    RealD flops,bytes;
+
+    t_tot=-usecond();
+    CoarseVector tin=in; // NOTE(review): copy presumably exists because ExchangePeriodic wants a non-const lattice -- confirm
+    t_exch=-usecond();
+    CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
+    t_exch+=usecond();
+
+    const int npoint = geom.npoint;
+    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0]; // dimension 0 counts right hand sides
+    assert(nrhs>=1);
+
+    // Cost estimate (the 8.0 is presumably the real flop count of one complex multiply-add)
+    int64_t osites=in.Grid()->oSites(); // unpadded
+    flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
+    bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
+          + 2.0*osites*sizeof(siteVector)*npoint;
+
+    // BLAS_B receives the padded field (ghost zone included); the BLAS_BP pointer tables refer into it.
+    t_GtoB=-usecond();
+    GridtoBLAS(pin,BLAS_B);
+    t_GtoB+=usecond();
+
+    GridBLAS BLAS;
+
+    t_mult=-usecond();
+    for(int p=0;p<npoint;p++){
+      // beta = 0 on the first stencil point overwrites BLAS_C; beta = 1 afterwards accumulates onto it.
+      const ComplexD beta = (p==0) ? ComplexD(0.0) : ComplexD(1.0);
+
+      BLAS.gemmBatched(nbasis,nrhs,nbasis,
+		       ComplexD(1.0),
+		       BLAS_AP[p],
+		       BLAS_BP[p],
+		       beta,
+		       BLAS_CP);
+    }
+    BLAS.synchronise();
+    t_mult+=usecond();
+
+    // BLAS_C holds the unpadded result; copy it back into out.
+    t_BtoG=-usecond();
+    BLAStoGrid(out,BLAS_C);
+    t_BtoG+=usecond();
+    t_tot+=usecond();
+
+    /*
+    std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult GtoB  "<<t_GtoB<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult BtoG  "<<t_BtoG<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult tot  "<<t_tot<<" us"<<std::endl;
+    std::cout << GridLogMessage<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
+    */
+    //    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+  };
+  virtual void Mdiag(const Field &in, Field &out) { assert(0); } // not implemented for this operator: trips assert(0)
+  virtual void Mdir(const Field &in, Field &out, int dir, int disp) { assert(0); } // not implemented for this operator: trips assert(0)
+  virtual void MdirAll(const Field &in, std::vector<Field> &out) { assert(0); } // not implemented for this operator: trips assert(0)
+};
+  
+NAMESPACE_END(Grid);
@@ -0,0 +1,238 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: Grid/algorithms/multigrid/Geometry.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+
+/////////////////////////////////////////////////////////////////
+// Geometry class in cartesian case
+/////////////////////////////////////////////////////////////////
+
+// Nearest-neighbour stencil bookkeeping for the cartesian case.
+//
+// With nd stencil directions there are npoint = 2*nd+1 points:
+//   points 0 .. nd-1    : displacement +1 in direction base+p
+//   points nd .. 2*nd-1 : displacement -1 in direction base+(p-nd)
+//   point  2*nd         : the on-site term, tabulated as direction 0, displacement 0
+// Constructing with _d==5 builds the 4d stencil with base=1.
+class Geometry {
+public:
+  int npoint;                      // number of stencil points, 2*nd+1
+  int base;                        // offset added to stencil directions: 1 when built with _d==5, else 0
+  std::vector<int> directions   ;  // direction of each point
+  std::vector<int> displacements;  // displacement of each point: +1, -1 or 0
+  std::vector<int> points_dagger;  // point with the opposite displacement in the same direction
+
+  // Build the tables for a _d dimensional stencil (_d==5 is reduced to 4 with base=1).
+  Geometry(int _d)  {
+    
+    base = (_d==5) ? 1:0;
+
+    // make coarse grid stencil for 4d , not 5d
+    if ( _d==5 ) _d=4;
+
+    npoint = 2*_d+1;
+    directions.resize(npoint);
+    displacements.resize(npoint);
+    points_dagger.resize(npoint);
+    for(int d=0;d<_d;d++){
+      directions[d   ] = d+base;
+      directions[d+_d] = d+base;
+      displacements[d  ] = +1;
+      displacements[d+_d]= -1;
+      points_dagger[d   ] = d+_d;
+      points_dagger[d+_d] = d;
+    }
+    // Last point is the on-site term; it is its own dagger partner
+    directions   [2*_d]=0;
+    displacements[2*_d]=0;
+    points_dagger[2*_d]=2*_d;
+  }
+
+  // Return the point index whose (directions[],displacements[]) entry is (dir,disp).
+  // dir must lie in [base, base+nd) except for the on-site point, which is (0,0).
+  // NOTE(review): disp==0 with dir!=0 is not special-cased and yields the same
+  // index as disp==+1, exactly as before this change.
+  int point(int dir, int disp) {
+    assert(disp == -1 || disp == 0 || disp == 1);
+
+    // Number of stencil directions, recovered from npoint = 2*nd+1
+    const int nd = (npoint-1)/2;
+
+    // The on-site point is tabulated as (dir 0, disp 0) whatever base is, so it is
+    // recognised before the direction range check: with base=1 that check rejects
+    // dir 0 and the on-site point could otherwise never be looked up.
+    if(dir == 0 and disp == 0)
+      return npoint-1;
+
+    assert(base+0 <= dir && dir < base+nd);
+
+    // directions faster index = new indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  1  2  3  0  1  2  3  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  2  3  4  1  2  3  4  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+
+    // displacements faster index = old indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  0  1  1  2  2  3  3  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  1  2  2  3  3  4  4  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+
+    // New indexing: integer division makes (1-disp)/2 equal 0 for disp=+1 and 1 for disp=-1
+    return (1 - disp) / 2 * nd + dir - base;
+    // else // Old indexing
+    //   return (4 * (dir - base) + 1 - disp) / 2;
+  }
+};
+
+/////////////////////////////////////////////////////////////////
+// Less local equivalent of Geometry class in cartesian case
+/////////////////////////////////////////////////////////////////
+// Stencil description built from a coarse grid: the set of integer shift vectors
+// over four consecutive dimensions (starting at dimension 'skip') whose total
+// number of hops |s0|+|s1|+|s2|+|s3| does not exceed 'hops'.
+class NonLocalStencilGeometry {
+public:
+  //  int depth;
+  int skip;    // first dimension covered by the stencil (returned by DimSkip)
+  int hops;    // largest |s0|+|s1|+|s2|+|s3| kept by BuildShifts
+  int npoint;  // number of shifts kept, set by BuildShifts
+  std::vector<Coordinate> shifts;  // kept shift vectors, one per stencil point
+  Coordinate stencil_size;         // per dimension extent: 1, 2 or 3
+  Coordinate stencil_lo;           // per dimension lowest shift
+  Coordinate stencil_hi;           // per dimension highest shift
+  GridCartesian *grid;             // grid passed to the constructor; never deleted by this class
+  GridCartesian *Grid() {return grid;};
+  int Depth(void){return 1;};   // Ghost zone depth
+  int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
+  int DimSkip(void){return skip;};
+
+  virtual ~NonLocalStencilGeometry() {};
+
+  // Return the index of the stencil point whose shift is the negative of
+  // shifts[point]. Hits assert(0) (and returns -1) when no such point exists.
+  int  Reverse(int point)
+  {
+    int Nd = Grid()->Nd();
+    Coordinate shft = shifts[point];
+    Coordinate rev(Nd);
+    for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
+    for(int p=0;p<npoint;p++){
+      if(rev==shifts[p]){
+        return p;
+      }
+    }
+    assert(0);
+    return -1;
+  }
+  // Rebuild 'shifts' and 'npoint' from stencil_lo/stencil_hi and 'hops'.
+  void BuildShifts(void)
+  {
+    this->shifts.resize(0);
+    int Nd = this->grid->Nd();
+
+    int dd = this->DimSkip();
+    // The loops below read stencil_lo/hi at dd+0 .. dd+3, which were sized to Nd
+    assert(Nd >= dd+4);
+    for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
+    for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
+    for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
+    for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
+      Coordinate sft(Nd,0);
+      sft[dd+0] = s0;
+      sft[dd+1] = s1;
+      sft[dd+2] = s2;
+      sft[dd+3] = s3;
+      int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
+      if(nhops<=this->hops) this->shifts.push_back(sft);
+    }}}}
+    this->npoint = this->shifts.size();
+    std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
+  }
+  
+  // Choose the per dimension shift range from the global extent of the grid
+  // (extent 1: {0}, extent 2: {-1,0}, larger: {-1,0,+1}) and build the shifts.
+  // Initialisers are listed in member declaration order, which is the order
+  // in which they actually run.
+  NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops,int _skip) : skip(_skip), hops(_hops), grid(_coarse_grid)
+  {
+    Coordinate latt = grid->GlobalDimensions();
+    stencil_size.resize(grid->Nd());
+    stencil_lo.resize(grid->Nd());
+    stencil_hi.resize(grid->Nd());
+    for(int d=0;d<grid->Nd();d++){
+     if ( latt[d] == 1 ) {
+      stencil_lo[d] = 0;
+      stencil_hi[d] = 0;
+      stencil_size[d]= 1;
+     } else if ( latt[d] == 2 ) {
+      stencil_lo[d] = -1;
+      stencil_hi[d] = 0;
+      stencil_size[d]= 2;
+     } else if ( latt[d] > 2 ) {
+       stencil_lo[d] = -1;
+       stencil_hi[d] =  1;
+       stencil_size[d]= 3;
+     }
+    }
+    this->BuildShifts();
+  };
+
+};
+
+// Need to worry about red-black now
+// NonLocalStencilGeometry constructed with skip 0.
+class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
+public:
+  virtual int DerivedDimSkip(void)
+  {
+    return 0;
+  }
+  NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops)
+    : NonLocalStencilGeometry(Coarse,_hops,0)
+  {
+  }
+  virtual ~NonLocalStencilGeometry4D() {}
+};
+// NonLocalStencilGeometry constructed with skip 1.
+class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
+public:
+  virtual int DerivedDimSkip(void)
+  {
+    return 1;
+  }
+  NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops)
+    : NonLocalStencilGeometry(Coarse,_hops,1)
+  {
+  }
+  virtual ~NonLocalStencilGeometry5D() {}
+};
+/*
+ * Bunch of different options classes
+ */
+// NonLocalStencilGeometry4D with the hop limit fixed at 4.
+class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
+public:
+  NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry4D(Coarse,4) {}
+};
+// NonLocalStencilGeometry5D with the hop limit fixed at 4.
+class NextToNextToNextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
+public:
+  NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry5D(Coarse,4) {}
+};
+// NonLocalStencilGeometry4D with the hop limit fixed at 2.
+class NextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
+public:
+  NextToNearestStencilGeometry4D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry4D(Coarse,2) {}
+};
+// NonLocalStencilGeometry5D with the hop limit fixed at 2.
+class NextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
+public:
+  NextToNearestStencilGeometry5D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry5D(Coarse,2) {}
+};
+// NonLocalStencilGeometry4D with the hop limit fixed at 1.
+class NearestStencilGeometry4D : public NonLocalStencilGeometry4D {
+public:
+  NearestStencilGeometry4D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry4D(Coarse,1) {}
+};
+// NonLocalStencilGeometry5D with the hop limit fixed at 1.
+class NearestStencilGeometry5D : public NonLocalStencilGeometry5D {
+public:
+  NearestStencilGeometry5D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry5D(Coarse,1) {}
+};
+
+NAMESPACE_END(Grid);
@@ -0,0 +1,34 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: Grid/algorithms/multigrid/MultiGrid.h
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+#include <Grid/algorithms/multigrid/Aggregates.h>
+#include <Grid/algorithms/multigrid/Geometry.h>
+#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
+#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
+#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>
@@ -175,8 +175,56 @@ template<class T> using cshiftAllocator = std::allocator<T>;

 template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
 template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
-template<class T> using commVector = std::vector<T,devAllocator<T> >;
-template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
+template<class T> using commVector    = std::vector<T,devAllocator<T> >;
+template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
+template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;
+
+/*
+template<class T> class vecView
+{
+ protected:
+  T * data;
+  uint64_t size;
+  ViewMode mode;
+  void * cpu_ptr;
+ public:
+  accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
+  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
+  {
+    cpu_ptr = &refer_to_me[0];
+    size = refer_to_me.size();
+    mode = _mode;
+    data =(T *) MemoryManager::ViewOpen(cpu_ptr,
+					size*sizeof(T),
+					mode,
+					AdviseDefault);
+  }
+  void ViewClose(void)
+  { // Inform the manager
+    MemoryManager::ViewClose(this->cpu_ptr,this->mode);    
+  }
+};
+
+template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
+{
+  vecView<T> ret(vec,_mode); // does the open
+  return ret;                // must be closed
+}
+
+// Little autoscope assister
+template<class View> 
+class VectorViewCloser
+{
+  View v;  // Take a copy of view and call view close when I go out of scope automatically
+ public:
+  VectorViewCloser(View &_v) : v(_v) {};
+  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose();  MemoryManager::NotifyDeletion(ptr);}
+};
+
+#define autoVecView(v_v,v,mode)					\
+  auto v_v = VectorView(v,mode);				\
+  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
+*/

 NAMESPACE_END(Grid);

@@ -209,9 +209,9 @@ private:
  static void     CpuViewClose(uint64_t Ptr);
  static uint64_t CpuViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 #endif
-  static void NotifyDeletion(void * CpuPtr);

 public:
+  static void NotifyDeletion(void * CpuPtr);
  static void Print(void);
  static void PrintAll(void);
  static void PrintState( void* CpuPtr);
@@ -8,7 +8,7 @@ NAMESPACE_BEGIN(Grid);
 static char print_buffer [ MAXLINE ];

 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
 //#define dprintf(...) 


@@ -111,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -141,7 +141,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
@@ -155,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  //  uint64_t CpuPtr = AccCache.CpuPtr;
  DeviceEvictions++;
@@ -169,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: Flush  %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@@ -184,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@@ -474,6 +474,7 @@ void  MemoryManager::Print(void)
  std::cout << GridLogMessage << DeviceEvictions  << " Evictions from device " << std::endl;
  std::cout << GridLogMessage << DeviceDestroy    << " Destroyed vectors on device " << std::endl;
  std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
+  acceleratorMem();
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 }
 void  MemoryManager::PrintAll(void)
@@ -70,8 +70,8 @@ public:
  Coordinate _istride;    // Inner stride i.e. within simd lane
  int _osites;                  // _isites*_osites = product(dimensions).
  int _isites;
-  int _fsites;                  // _isites*_osites = product(dimensions).
-  int _gsites;
+  int64_t _fsites;                  // _isites*_osites = product(dimensions).
+  int64_t _gsites;
  Coordinate _slice_block;// subslice information
  Coordinate _slice_stride;
  Coordinate _slice_nblock;
@@ -183,7 +183,7 @@ public:
  inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
  inline int oSites(void) const { return _osites; };
  inline int lSites(void) const { return _isites*_osites; }; 
-  inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
+  // Product _isites*_osites*_Nprocessors, with every factor widened to int64_t before multiplying.
+  inline int64_t gSites(void) const {
+    const int64_t isites = (int64_t)_isites;
+    const int64_t osites = (int64_t)_osites;
+    const int64_t nproc  = (int64_t)_Nprocessors;
+    return isites*osites*nproc;
+  }; 
  inline int Nd    (void) const { return _ndimension;};

  inline const Coordinate LocalStarts(void)             { return _lstart;    };
@@ -214,7 +214,7 @@ public:
  ////////////////////////////////////////////////////////////////
  // Global addressing
  ////////////////////////////////////////////////////////////////
-  void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
+  void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
    assert(gidx< gSites());
    Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
  }
@@ -222,7 +222,7 @@ public:
    assert(lidx<lSites());
    Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
  }
-  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
+  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
    gidx=0;
    int mult=1;
    for(int mu=0;mu<_ndimension;mu++) {
@@ -138,6 +138,14 @@ public:
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
+  // Split (begin/complete) form of the exchange.
+  // SendToRecvFromBegin starts a transfer of 'bytes' bytes out of xmit to rank 'dest'
+  // and into recv from rank 'from', appending the requests it starts to 'list';
+  // CommsComplete is then handed the same list to finish them.
+  // NOTE(review): behaviour is implementation specific - one implementation in this
+  // change uses MPI_Irecv/MPI_Isend/MPI_Waitall, another is an assert(0) stub.
+  void CommsComplete(std::vector<CommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+			   void *xmit,
+			   int dest,
+			   void *recv,
+			   int from,
+			   int bytes,int dir);
+  
  void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
@@ -306,6 +306,44 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
+
+// Start a non-blocking exchange: post a receive of 'bytes' chars from rank 'from'
+// into recv, then a send of 'bytes' chars from xmit to rank 'dest'. Both requests
+// are appended to 'list' (receive first). Neither peer may be this rank.
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit,
+                                                int dest,
+                                                void *recv,
+                                                int from,
+                                                int bytes,int dir)
+{
+  assert(dest != _processor);
+  assert(from != _processor);
+
+  // Receive side: the tag combines the direction with the sending rank
+  MPI_Request recv_request;
+  const int recv_tag = dir+from*32;
+  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,recv_tag,communicator,&recv_request);
+  assert(ierr==0);
+  list.push_back(recv_request);
+
+  // Send side: same tag formula, with this rank as the sender
+  MPI_Request send_request;
+  const int send_tag = dir+_processor*32;
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,send_tag,communicator,&send_request);
+  assert(ierr==0);
+  list.push_back(send_request);
+}
+// Wait for every request held in 'list' with MPI_Waitall, then empty the list.
+// An empty list returns immediately without calling MPI.
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+{
+  if (list.empty()) return;
+
+  const int outstanding = list.size();
+  std::vector<MPI_Status> statuses(outstanding);
+  int ierr = MPI_Waitall(outstanding,list.data(),statuses.data());
+  assert(ierr==0);
+  list.clear();
+}
+
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
@@ -91,6 +91,17 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
+// Not available in this communicator implementation: the body is a bare assert(0).
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+{
+  assert(0);
+}
+// Not available in this communicator implementation: the body is a bare assert(0).
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit, int dest,
+                                                void *recv, int from,
+                                                int bytes, int dir)
+{
+  assert(0);
+}
+
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  bcopy(in,out,bytes*words);
@@ -360,7 +360,7 @@ public:

 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
  typedef typename vobj::scalar_object sobj;
-  for(int g=0;g<o.Grid()->_gsites;g++){
+  for(int64_t g=0;g<o.Grid()->_gsites;g++){

    Coordinate gcoor;
    o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
+template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
@@ -203,6 +203,27 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return real(nrm); 
 }

+
+// norm2 of a unary lattice expression: the expression is passed through closure()
+// and norm2 is taken of what that returns.
+template<class Op,class T1>
+inline RealD norm2(const LatticeUnaryExpression<Op,T1> & expr)
+{
+  return norm2(closure(expr));
+}
+
+// norm2 of a binary lattice expression: the expression is passed through closure()
+// and norm2 is taken of what that returns.
+template<class Op,class T1,class T2>
+inline RealD norm2(const LatticeBinaryExpression<Op,T1,T2> & expr)
+{
+  return norm2(closure(expr));
+}
+
+
+// norm2 of a trinary lattice expression: the expression is passed through closure()
+// and norm2 is taken of what that returns.
+template<class Op,class T1,class T2,class T3>
+inline RealD norm2(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+{
+  return norm2(closure(expr));
+}
+
+
 //The global maximum of the site norm2
 template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 {
@@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  cudaGetDevice(&device);
 #endif
 #ifdef GRID_HIP
-  hipGetDevice(&device);
+  auto discard=hipGetDevice(&device);
 #endif
  
  Iterator warpSize            = gpu_props[device].warpSize;
@@ -361,9 +361,14 @@ public:
    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
-
-  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
-
+  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist)
+  {
+    if ( l.Grid()->_isCheckerBoarded ) {
+      Lattice<vobj> tmp(_grid);
+      fill(tmp,dist);
+      pickCheckerboard(l.Checkerboard(),l,tmp);
+      return;
+    }
    typedef typename vobj::scalar_object scalar_object;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
@@ -427,7 +432,7 @@ public:
 #if 1
    thread_for( lidx, _grid->lSites(), {

-	int gidx;
+	int64_t gidx;
 	int o_idx;
 	int i_idx;
 	int rank;
@@ -276,18 +276,33 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,

  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( ip_         , ip,         AcceleratorWrite);
+  RealD t_IP=0;
+  RealD t_co=0;
+  RealD t_za=0;
  for(int v=0;v<nbasis;v++) {
+    t_IP-=usecond();
    blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
+    t_IP+=usecond();
+    t_co-=usecond();
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
 	convertType(coarseData_[sc](v),ip_[sc]);
    });
+    t_co+=usecond();

    // improve numerical stability of projection
    // |fine> = |fine> - <basis|fine> |basis>
    ip=-ip;
+    t_za-=usecond();
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
+    t_za+=usecond();
  }
+  //  std::cout << GridLogPerformance << " blockProject : blockInnerProduct :  "<<t_IP<<" us"<<std::endl;
+  //  std::cout << GridLogPerformance << " blockProject : conv              :  "<<t_co<<" us"<<std::endl;
+  //  std::cout << GridLogPerformance << " blockProject : blockZaxpy        :  "<<t_za<<" us"<<std::endl;
 }
+// This only minimises data motion from CPU to GPU
+// there is chance of better implementation that does a vxk loop of inner products to data share
+// at the GPU thread level
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
                               const std::vector<Lattice<vobj>> &fineData,
@@ -393,8 +408,15 @@ template<class vobj,class CComplex>
  Lattice<dotp> coarse_inner(coarse);

  // Precision promotion
+  RealD t;
+  t=-usecond();
  fine_inner = localInnerProductD<vobj>(fineX,fineY);
+  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "<<t<<" us"<<std::endl;
+  
+  t=-usecond();
  blockSum(coarse_inner,fine_inner);
+  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : blockSum "<<t<<" us"<<std::endl;
+  t=-usecond();
  {
    autoView( CoarseInner_  , CoarseInner,AcceleratorWrite);
    autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
@@ -402,6 +424,7 @@ template<class vobj,class CComplex>
      convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
    });
  }
+  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "<<t<<" us"<<std::endl;
 
 }

@@ -444,6 +467,9 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 template<class vobj>
 inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) 
 {
+  const int maxsubsec=256;
+  typedef iVector<vobj,maxsubsec> vSubsec;
+
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();

@@ -463,37 +489,62 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( fineData_   , fineData, AcceleratorRead);

-  auto coarseData_p = &coarseData_[0];
-  auto fineData_p = &fineData_[0];
+  auto coarseData_p  = &coarseData_[0];
+  auto fineData_p    = &fineData_[0];
  
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;

  vobj zz = Zero();
-  
-  accelerator_for(sc,coarse->oSites(),1,{

+  // Somewhat lazy calculation
+  // Find the biggest power of two subsection divisor less than or equal to maxsubsec
+  int subsec=maxsubsec;
+  int subvol;
+  subvol=blockVol/subsec;
+  while(subvol*subsec!=blockVol){
+    subsec = subsec/2;
+    subvol=blockVol/subsec;
+  };
+
+  Lattice<vSubsec> coarseTmp(coarse);
+  autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard);
+  auto coarseTmp_p= &coarseTmp_[0];
+  
+  // Sum within subsecs in a first kernel
+  accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{
+
+      int sc=sce/subsec;
+      int e=sce%subsec;
+      
      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate

-      vobj cd = zz;
-      
-      for(int sb=0;sb<blockVol;sb++){
-
+      auto cd = coalescedRead(zz);
+      for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
 	int sf;
 	Coordinate coor_b(_ndimension);
 	Coordinate coor_f(_ndimension);
 	Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
 	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
 	Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
-
-	cd=cd+fineData_p[sf];
+	
+	cd=cd+coalescedRead(fineData_p[sf]);
      }

-      coarseData_p[sc] = cd;
+      coalescedWrite(coarseTmp_[sc](e),cd);

    });
+   // Sum across subsecs in a second kernel
+   accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
+      auto cd = coalescedRead(coarseTmp_p[sc](0));
+      for(int e=1;e<subsec;e++){
+	cd=cd+coalescedRead(coarseTmp_p[sc](e));
+      }
+      coalescedWrite(coarseData_p[sc],cd);
+   });
+
  return;
 }

@@ -550,7 +601,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
  blockOrthonormalize(ip,Basis);
 }

-#if 0
+#ifdef GRID_ACCELERATED
 // TODO: CPU optimized version here
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
@@ -576,26 +627,37 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  autoView( fineData_   , fineData, AcceleratorWrite);
  autoView( coarseData_ , coarseData, AcceleratorRead);

+  typedef LatticeView<vobj> Vview;
+  std::vector<Vview> AcceleratorVecViewContainer_h; 
+  for(int v=0;v<nbasis;v++) {
+    AcceleratorVecViewContainer_h.push_back(Basis[v].View(AcceleratorRead));
+  }
+  static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis); 
+  acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview));
+  auto Basis_p = &AcceleratorVecViewContainer[0];
  // Loop with a cache friendly loop ordering
-  accelerator_for(sf,fine->oSites(),1,{
+  Coordinate frdimensions=fine->_rdimensions;
+  Coordinate crdimensions=coarse->_rdimensions;
+  accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{
    int sc;
    Coordinate coor_c(_ndimension);
    Coordinate coor_f(_ndimension);

-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+    Lexicographic::CoorFromIndex(coor_f,sf,frdimensions);
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+    Lexicographic::IndexFromCoor(coor_c,sc,crdimensions);

-    for(int i=0;i<nbasis;i++) {
-      /*      auto basis_ = Basis[i],  );*/
-      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
-      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
-    }
+    auto sum= coarseData_(sc)(0) *Basis_p[0](sf);
+    for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf);
+    coalescedWrite(fineData_[sf],sum);
  });
+  for(int v=0;v<nbasis;v++) {
+    AcceleratorVecViewContainer_h[v].ViewClose();
+  }
  return;
-  
 }
 #else
+// CPU version
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -682,7 +744,11 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

-  static const int words=sizeof(vobj)/sizeof(vector_type);
+  const int words=sizeof(vobj)/sizeof(vector_type);
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // checks should guarantee that the operations are local
+  //////////////////////////////////////////////////////////////////////////////////////////

  GridBase *Fg = From.Grid();
  GridBase *Tg = To.Grid();
@@ -697,52 +763,38 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
-  // the above should guarantee that the operations are local
-  
-#if 1
+
+  ///////////////////////////////////////////////////////////
+  // do the index calc on the GPU
+  ///////////////////////////////////////////////////////////
+  Coordinate f_ostride = Fg->_ostride;
+  Coordinate f_istride = Fg->_istride;
+  Coordinate f_rdimensions = Fg->_rdimensions;
+  Coordinate t_ostride = Tg->_ostride;
+  Coordinate t_istride = Tg->_istride;
+  Coordinate t_rdimensions = Tg->_rdimensions;

  size_t nsite = 1;
  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
-  
-  size_t tbytes = 4*nsite*sizeof(int);
-  int *table = (int*)malloc(tbytes);
- 
-  thread_for(idx, nsite, {
-      Coordinate from_coor, to_coor;
-      size_t rem = idx;
-      for(int i=0;i<nd;i++){
-	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
-	from_coor[i] = base_i + FromLowerLeft[i];
-	to_coor[i] = base_i + ToLowerLeft[i];
-      }
-      
-      int foidx = Fg->oIndex(from_coor);
-      int fiidx = Fg->iIndex(from_coor);
-      int toidx = Tg->oIndex(to_coor);
-      int tiidx = Tg->iIndex(to_coor);
-      int* tt = table + 4*idx;
-      tt[0] = foidx;
-      tt[1] = fiidx;
-      tt[2] = toidx;
-      tt[3] = tiidx;
-    });
-  
-  int* table_d = (int*)acceleratorAllocDevice(tbytes);
-  acceleratorCopyToDevice(table,table_d,tbytes);

  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

  autoView(from_v,From,AcceleratorRead);
  autoView(to_v,To,AcceleratorWrite);
-  
+
  accelerator_for(idx,nsite,1,{
-      static const int words=sizeof(vobj)/sizeof(vector_type);
-      int* tt = table_d + 4*idx;
-      int from_oidx = *tt++;
-      int from_lane = *tt++;
-      int to_oidx = *tt++;
-      int to_lane = *tt;
+
+      Coordinate from_coor, to_coor, base;
+      Lexicographic::CoorFromIndex(base,idx,RegionSize);
+      for(int i=0;i<nd;i++){
+	from_coor[i] = base[i] + FromLowerLeft[i];
+	to_coor[i] = base[i] + ToLowerLeft[i];
+      }
+      int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
+      int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
+      int to_oidx   = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
+      int to_lane   = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);

      const vector_type* from = (const vector_type *)&from_v[from_oidx];
      vector_type* to = (vector_type *)&to_v[to_oidx];
@@ -752,56 +804,146 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 	stmp = getlane(from[w], from_lane);
 	putlane(to[w], stmp, to_lane);
      }
-    });
-  
-  acceleratorFreeDevice(table_d);    
-  free(table);
-  
-
-#else  
-  Coordinate ldf = Fg->_ldimensions;
-  Coordinate rdf = Fg->_rdimensions;
-  Coordinate isf = Fg->_istride;
-  Coordinate osf = Fg->_ostride;
-  Coordinate rdt = Tg->_rdimensions;
-  Coordinate ist = Tg->_istride;
-  Coordinate ost = Tg->_ostride;
-
-  autoView( t_v , To, CpuWrite);
-  autoView( f_v , From, CpuRead);
-  thread_for(idx,Fg->lSites(),{
-    sobj s;
-    Coordinate Fcoor(nd);
-    Coordinate Tcoor(nd);
-    Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
-    int in_region=1;
-    for(int d=0;d<nd;d++){
-      if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){ 
-	in_region=0;
-      }
-      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
-    }
-    if (in_region) {
-#if 0      
-      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
-      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
-      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
-      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
-      scalar_type * fp = (scalar_type *)&f_v[odx_f];
-      scalar_type * tp = (scalar_type *)&t_v[odx_t];
-      for(int w=0;w<words;w++){
-	tp[w].putlane(fp[w].getlane(idx_f),idx_t);
-      }
-#else
-    peekLocalSite(s,f_v,Fcoor);
-    pokeLocalSite(s,t_v,Tcoor);
-#endif
-    }
  });
-
-#endif
 }

+// Copy the whole local volume of the lower dimensional lattice From into the hyperplane
+// 'slice' (a local coordinate along dimension 'orthog') of the higher dimensional lattice To.
+// The site index arithmetic is carried out inside the accelerator kernel.
+template<class vobj>
+void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int orthog)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  const int words=sizeof(vobj)/sizeof(vector_type);
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // checks should guarantee that the operations are local
+  //////////////////////////////////////////////////////////////////////////////////////////
+  GridBase *Fg = From.Grid();
+  GridBase *Tg = To.Grid();
+  assert(!Fg->_isCheckerBoarded);
+  assert(!Tg->_isCheckerBoarded);
+  int nF = Fg->_ndimension;
+  int nT = Tg->_ndimension;
+  assert(nF+1 == nT);
+  assert(orthog>=0 && orthog<nT);                     // otherwise the kernel reads from_coor[nF]
+  assert(slice>=0 && slice<Tg->_ldimensions[orthog]); // slice is used as a local coordinate of To
+
+  ///////////////////////////////////////////////////////////
+  // do the index calc on the GPU
+  ///////////////////////////////////////////////////////////
+  Coordinate f_ostride = Fg->_ostride;
+  Coordinate f_istride = Fg->_istride;
+  Coordinate f_rdimensions = Fg->_rdimensions;
+  Coordinate t_ostride = Tg->_ostride;
+  Coordinate t_istride = Tg->_istride;
+  Coordinate t_rdimensions = Tg->_rdimensions;
+  Coordinate RegionSize = Fg->_ldimensions;
+  size_t nsite = 1;
+  for(int i=0;i<nF;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
+
+  autoView(from_v,From,AcceleratorRead);
+  autoView(to_v,To,AcceleratorWrite);
+
+  accelerator_for(idx,nsite,1,{
+
+      Coordinate from_coor(nF), to_coor(nT);
+      Lexicographic::CoorFromIndex(from_coor,idx,RegionSize);
+      int j=0;
+      for(int i=0;i<nT;i++){
+	if ( i!=orthog ) {
+	  to_coor[i] = from_coor[j]; // dimensions other than orthog keep their order
+	  j++;
+	} else {
+	  to_coor[i] = slice;
+	}
+      }
+      int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
+      int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
+      int to_oidx   = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
+      int to_lane   = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
+
+      const vector_type* from = (const vector_type *)&from_v[from_oidx];
+      vector_type* to = (vector_type *)&to_v[to_oidx];
+
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+  });
+}
+
+// Copy the hyperplane 'slice' (a local coordinate along dimension 'orthog') of the higher
+// dimensional lattice From onto the whole local volume of the lower dimensional lattice To.
+// The site index arithmetic is carried out inside the accelerator kernel.
+template<class vobj>
+void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, int orthog)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  const int words=sizeof(vobj)/sizeof(vector_type);
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // checks should guarantee that the operations are local
+  //////////////////////////////////////////////////////////////////////////////////////////
+  GridBase *Fg = From.Grid();
+  GridBase *Tg = To.Grid();
+  assert(!Fg->_isCheckerBoarded);
+  assert(!Tg->_isCheckerBoarded);
+  int nF = Fg->_ndimension;
+  int nT = Tg->_ndimension;
+  assert(nT+1 == nF);
+  assert(orthog>=0 && orthog<nF);                     // otherwise the kernel reads to_coor[nT]
+  assert(slice>=0 && slice<Fg->_ldimensions[orthog]); // slice is used as a local coordinate of From
+
+  ///////////////////////////////////////////////////////////
+  // do the index calc on the GPU
+  ///////////////////////////////////////////////////////////
+  Coordinate f_ostride = Fg->_ostride;
+  Coordinate f_istride = Fg->_istride;
+  Coordinate f_rdimensions = Fg->_rdimensions;
+  Coordinate t_ostride = Tg->_ostride;
+  Coordinate t_istride = Tg->_istride;
+  Coordinate t_rdimensions = Tg->_rdimensions;
+  Coordinate RegionSize = Tg->_ldimensions;
+  size_t nsite = 1;
+  for(int i=0;i<nT;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
+
+  autoView(from_v,From,AcceleratorRead);
+  autoView(to_v,To,AcceleratorWrite);
+
+  accelerator_for(idx,nsite,1,{
+
+      Coordinate from_coor(nF), to_coor(nT);
+      Lexicographic::CoorFromIndex(to_coor,idx,RegionSize);
+      int j=0;
+      for(int i=0;i<nF;i++){
+	if ( i!=orthog ) {
+	  from_coor[i] = to_coor[j]; // dimensions other than orthog keep their order
+	  j++;
+	} else {
+	  from_coor[i] = slice;
+	}
+      }
+      int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
+      int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
+      int to_oidx   = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
+      int to_lane   = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
+
+      const vector_type* from = (const vector_type *)&from_v[from_oidx];
+      vector_type* to = (vector_type *)&to_v[to_oidx];
+
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+  });
+}

 template<class vobj>
 void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
@@ -891,9 +1033,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic

 }

-
-//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
-//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
+//Insert slice 'slice_lo' of 'lowDim' onto slice 'slice_hi' of 'higherDim' along direction 'orthog'; implemented with localCopyRegion
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@@ -914,121 +1054,18 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
    }
  }
-
-#if 1
-  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
-  size_t tbytes = 4*nsite*sizeof(int);
-  int *table = (int*)malloc(tbytes);
-  
-  thread_for(idx,nsite,{
-    Coordinate lcoor(nl);
-    Coordinate hcoor(nh);
-    lcoor[orthog] = slice_lo;
-    hcoor[orthog] = slice_hi;
-    size_t rem = idx;
-    for(int mu=0;mu<nl;mu++){
-      if(mu != orthog){
-	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
-	lcoor[mu] = hcoor[mu] = xmu;
-      }
-    }
-    int loidx = lg->oIndex(lcoor);
-    int liidx = lg->iIndex(lcoor);
-    int hoidx = hg->oIndex(hcoor);
-    int hiidx = hg->iIndex(hcoor);
-    int* tt = table + 4*idx;
-    tt[0] = loidx;
-    tt[1] = liidx;
-    tt[2] = hoidx;
-    tt[3] = hiidx;
-    });
-   
-  int* table_d = (int*)acceleratorAllocDevice(tbytes);
-  acceleratorCopyToDevice(table,table_d,tbytes);
-
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-
-  autoView(lowDim_v,lowDim,AcceleratorRead);
-  autoView(higherDim_v,higherDim,AcceleratorWrite);
-  
-  accelerator_for(idx,nsite,1,{
-      static const int words=sizeof(vobj)/sizeof(vector_type);
-      int* tt = table_d + 4*idx;
-      int from_oidx = *tt++;
-      int from_lane = *tt++;
-      int to_oidx = *tt++;
-      int to_lane = *tt;
-
-      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
-      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
-      
-      scalar_type stmp;
-      for(int w=0;w<words;w++){
-	stmp = getlane(from[w], from_lane);
-	putlane(to[w], stmp, to_lane);
-      }
-    });
-  
-  acceleratorFreeDevice(table_d);    
-  free(table);
-  
-#else
-  // the above should guarantee that the operations are local
-  autoView(lowDimv,lowDim,CpuRead);
-  autoView(higherDimv,higherDim,CpuWrite);
-  thread_for(idx,lg->lSites(),{
-    sobj s;
-    Coordinate lcoor(nl);
-    Coordinate hcoor(nh);
-    lg->LocalIndexToLocalCoor(idx,lcoor);
-    if( lcoor[orthog] == slice_lo ) { 
-      hcoor=lcoor;
-      hcoor[orthog] = slice_hi;
-      peekLocalSite(s,lowDimv,lcoor);
-      pokeLocalSite(s,higherDimv,hcoor);
-    }
-  });
-#endif
+  Coordinate sz = lg->_ldimensions;
+  sz[orthog]=1;
+  Coordinate f_ll(nl,0); f_ll[orthog]=slice_lo;
+  Coordinate t_ll(nh,0); t_ll[orthog]=slice_hi;
+  localCopyRegion(lowDim,higherDim,f_ll,t_ll,sz);
 }


 template<class vobj>
 void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
-  typedef typename vobj::scalar_object sobj;
-
-  GridBase *lg = lowDim.Grid();
-  GridBase *hg = higherDim.Grid();
-  int nl = lg->_ndimension;
-  int nh = hg->_ndimension;
-
-  assert(nl == nh);
-  assert(orthog<nh);
-  assert(orthog>=0);
-
-  for(int d=0;d<nh;d++){
-    if ( d!=orthog ) {
-    assert(lg->_processors[d]  == hg->_processors[d]);
-    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
-  }
-  }
-
-  // the above should guarantee that the operations are local
-  autoView(lowDimv,lowDim,CpuWrite);
-  autoView(higherDimv,higherDim,CpuRead);
-  thread_for(idx,lg->lSites(),{
-    sobj s;
-    Coordinate lcoor(nl);
-    Coordinate hcoor(nh);
-    lg->LocalIndexToLocalCoor(idx,lcoor);
-    if( lcoor[orthog] == slice_lo ) { 
-      hcoor=lcoor;
-      hcoor[orthog] = slice_hi;
-      peekLocalSite(s,higherDimv,hcoor);
-      pokeLocalSite(s,lowDimv,lcoor);
-    }
-  });
+  InsertSliceLocal(higherDim,lowDim,slice_hi,slice_lo,orthog); // extraction is insertion with source/destination and their slice indices swapped
 }


@@ -1054,7 +1091,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)

  Coordinate fcoor(nd);
  Coordinate ccoor(nd);
-  for(int g=0;g<fg->gSites();g++){
+  for(int64_t g=0;g<fg->gSites();g++){

    fg->GlobalIndexToGlobalCoor(g,fcoor);
    for(int d=0;d<nd;d++){
@@ -1740,5 +1777,35 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
  }
 }

+// blockProjectFast: for each basis index v, run blockInnerProductD(ip,Basis[v],fineData)
+// and convert ip into component v of coarseData. Faster but less accurate blockProject
+//////////////////////////////////////////////////////
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
+			     const             Lattice<vobj>   &fineData,
+			     const VLattice &Basis)
+{
+  GridBase * fine  = fineData.Grid(); // NOTE(review): not used below
+  GridBase * coarse= coarseData.Grid();
+
+  Lattice<iScalar<CComplex> > ip(coarse); // scratch coarse field, overwritten for every basis vector
+
+  autoView( coarseData_ , coarseData, AcceleratorWrite);
+  autoView( ip_         , ip,         AcceleratorWrite); // NOTE(review): opened once, then read in the kernel after blockInnerProductD writes ip — confirm the view stays coherent
+  RealD t_IP=0; // time accumulated in blockInnerProductD; NOTE(review): never reported
+  RealD t_co=0; // time accumulated in the convert kernel; NOTE(review): never reported
+  for(int v=0;v<nbasis;v++) {
+    t_IP-=usecond();
+    blockInnerProductD(ip,Basis[v],fineData); 
+    t_IP+=usecond();
+    t_co-=usecond();
+    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+	convertType(coarseData_[sc](v),ip_[sc]); // component v of the coarse vector at outer site sc
+      });
+    t_co+=usecond();
+  }
+}
+
+
 NAMESPACE_END(Grid);

@@ -45,6 +45,188 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
 };  

+
+/*
+ *
+ * TODO: 
+ *  -- address elements of vobj via thread block in Scatter/Gather
+ *  -- overlap comms with motion in Face_exchange
+ *
+ */
+// ScatterSlice: unpack buf words [offset, offset+face_ovol/simd[dim]) into the plane of lat at local coordinate x along dim.
+template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
+					      Lattice<vobj> &lat,
+					      int x,
+					      int dim,
+					      int offset=0)
+{
+  const int Nsimd=vobj::Nsimd();
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *grid = lat.Grid();
+  Coordinate simd = grid->_simd_layout;
+  int Nd          = grid->Nd();
+  int block       = grid->_slice_block[dim];
+  int stride      = grid->_slice_stride[dim];
+  int nblock      = grid->_slice_nblock[dim];
+  int rd          = grid->_rdimensions[dim];
+
+  int ox = x%rd; // outer-site coordinate of the plane along dim
+  int ix = x/rd; // SIMD-lane coordinate of the plane along dim
+
+  int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d]; // NOTE(review): computed but not used below
+
+  Coordinate rsimd= simd;  rsimd[dim]=1; // maybe reduce Nsimd
+
+  int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
+  int rNsimda= Nsimd/simd[dim]; // should be equal
+  assert(rNsimda==rNsimd);
+  int face_ovol=block*nblock; // presumably the number of outer sites in one plane orthogonal to dim — confirm
+
+  //  assert(buf.size()==face_ovol*rNsimd);
+
+  /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
+  //Let's make it work on GPU and then make a special accelerator_for that
+  //doesn't hide the SIMD direction and keeps explicit in the threadIdx
+  //for cross platform
+  // FIXME -- can put internal indices into thread loop
+  auto buf_p = & buf[0];
+  autoView(lat_v, lat, AcceleratorWrite);
+  accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
+
+    // scalar layout won't coalesce
+#ifdef GRID_SIMT
+      {
+	int blane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int blane=0;blane<Nsimd;blane++) {
+#endif
+	int olane=blane%rNsimd;               // reduced lattice lane
+	int obit =blane/rNsimd;               // which of the simd[dim] outer sites packed in this buffer word
+
+	///////////////////////////////////////////////////////////////
+	// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
+	///////////////////////////////////////////////////////////////
+	int ssp = ss*simd[dim]+obit;
+	int b    = ssp%block;
+	int n    = ssp/block;
+	int osite= b+n*stride + ox*block;
+	
+	////////////////////////////////////////////
+	// isite -- map lane within buffer to lane within lattice
+	////////////////////////////////////////////
+	Coordinate icoor;
+	int lane;
+	Lexicographic::CoorFromIndex(icoor,olane,rsimd);
+	icoor[dim]=ix;
+	Lexicographic::IndexFromCoor(icoor,lane,simd);
+	
+	///////////////////////////////////////////
+	// Transfer into lattice - will coalesce
+	///////////////////////////////////////////
+	//	sobj obj = extractLane(blane,buf_p[ss+offset]);
+	//	insertLane(lane,lat_v[osite],obj);
+	const int words=sizeof(vobj)/sizeof(vector_type);
+	vector_type * from = (vector_type *)&buf_p[ss+offset]; // cast drops const; the buffer is only read here
+	vector_type * to   = (vector_type *)&lat_v[osite];
+	scalar_type stmp;
+	for(int w=0;w<words;w++){
+	  stmp = getlane(from[w], blane);
+	  putlane(to[w], stmp, lane);
+	}
+      }
+  });
+}
+// GatherSlice: pack the plane of lat at local coordinate x along dim into buf words [offset, offset+face_ovol/simd[dim]).
+template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
+					     const Lattice<vobj> &lat,
+					     int x,
+					     int dim,
+					     int offset=0)
+{
+  const int Nsimd=vobj::Nsimd();
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  autoView(lat_v, lat, AcceleratorRead);
+
+  GridBase *grid = lat.Grid();
+  Coordinate simd = grid->_simd_layout;
+  int Nd          = grid->Nd();
+  int block       = grid->_slice_block[dim];
+  int stride      = grid->_slice_stride[dim];
+  int nblock      = grid->_slice_nblock[dim];
+  int rd          = grid->_rdimensions[dim];
+
+  int ox = x%rd; // outer-site coordinate of the plane along dim
+  int ix = x/rd; // SIMD-lane coordinate of the plane along dim
+
+  int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d]; // NOTE(review): computed but not used below
+
+  Coordinate rsimd= simd;  rsimd[dim]=1; // maybe reduce Nsimd
+
+  int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
+  // NOTE(review): ScatterSlice asserts Nsimd/simd[dim]==rNsimd at this point; no such check is made here
+  int face_ovol=block*nblock; // presumably the number of outer sites in one plane orthogonal to dim — confirm
+
+  //  assert(buf.size()==face_ovol*rNsimd);
+
+  /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
+  //Let's make it work on GPU and then make a special accelerator_for that
+  //doesn't hide the SIMD direction and keeps explicit in the threadIdx
+  //for cross platform
+  //For CPU perhaps just run a loop over Nsimd
+  auto buf_p = & buf[0];
+  accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
+
+    // scalar layout won't coalesce
+#ifdef GRID_SIMT
+      {
+	int blane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int blane=0;blane<Nsimd;blane++) {
+#endif
+	int olane=blane%rNsimd;               // reduced lattice lane
+	int obit =blane/rNsimd;               // which of the simd[dim] outer sites packed in this buffer word
+	
+	////////////////////////////////////////////
+	// osite
+	////////////////////////////////////////////
+	int ssp = ss*simd[dim]+obit;
+	int b    = ssp%block;
+	int n    = ssp/block;
+	int osite= b+n*stride + ox*block;
+
+	////////////////////////////////////////////
+	// isite -- map lane within buffer to lane within lattice
+	////////////////////////////////////////////
+	Coordinate icoor;
+	int lane;
+	Lexicographic::CoorFromIndex(icoor,olane,rsimd);
+	icoor[dim]=ix;
+	Lexicographic::IndexFromCoor(icoor,lane,simd);
+	
+	///////////////////////////////////////////
+	// Take out of lattice
+	///////////////////////////////////////////
+	//	sobj obj = extractLane(lane,lat_v[osite]);
+	//	insertLane(blane,buf_p[ss+offset],obj);
+	const int words=sizeof(vobj)/sizeof(vector_type);
+	vector_type * to    = (vector_type *)&buf_p[ss+offset];
+	vector_type * from  = (vector_type *)&lat_v[osite]; // cast drops const; the lattice view is only read here
+	scalar_type stmp;
+	for(int w=0;w<words;w++){
+	  stmp = getlane(from[w], lane);
+	  putlane(to[w], stmp, blane);
+	}
+      }
+  });
+}
+
+
 class PaddedCell {
 public:
  GridCartesian * unpadded_grid;
@@ -63,14 +245,18 @@ public:
    dims=_grid->Nd();
    AllocateGrids();
    Coordinate local     =unpadded_grid->LocalDimensions();
+    Coordinate procs     =unpadded_grid->ProcessorGrid();
    for(int d=0;d<dims;d++){
-      assert(local[d]>=depth);
+      if ( procs[d] > 1 ) assert(local[d]>=depth);
    }
  }
   void DeleteGrids(void) // free the grids created by AllocateGrids and empty the list
   {
+    Coordinate processors=unpadded_grid->_processors;
     for(int d=0;d<grids.size();d++){
-      delete grids[d];
+      if ( processors[d] > 1 ) { // only these entries were new'd in AllocateGrids; the others alias an earlier grid
+	delete grids[d];
+      }
     }
     grids.resize(0);
   };
@@ -81,27 +267,36 @@ public:
    Coordinate processors=unpadded_grid->_processors;
    Coordinate plocal    =unpadded_grid->LocalDimensions();
    Coordinate global(dims);
-
+    GridCartesian *old_grid = unpadded_grid;
    // expand up one dim at a time
    for(int d=0;d<dims;d++){

-      plocal[d] += 2*depth; 
+      if ( processors[d] > 1 ) { 
+	plocal[d] += 2*depth; 
+      
+	for(int d=0;d<dims;d++){
+	  global[d] = plocal[d]*processors[d];
+	}

-      for(int d=0;d<dims;d++){
-	global[d] = plocal[d]*processors[d];
+	old_grid = new GridCartesian(global,simd,processors);
      }
-
-      grids.push_back(new GridCartesian(global,simd,processors));
+      grids.push_back(old_grid);
    }
  };
   template<class vobj>
   inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
   {
+    Coordinate processors=unpadded_grid->_processors;
+    // Copy the interior (unpadded) region of the padded field 'in' onto a new field on unpadded_grid.
     Lattice<vobj> out(unpadded_grid);
 
     Coordinate local     =unpadded_grid->LocalDimensions();
-    Coordinate fll(dims,depth); // depends on the MPI spread
+    // lower-left corner of the interior inside 'in': offset by depth where a halo was added
+    Coordinate fll(dims,depth);
     Coordinate tll(dims,0); // depends on the MPI spread
+    for(int d=0;d<dims;d++){
+      if( processors[d]==1 ) fll[d]=0; // AllocateGrids adds no halo in a dimension with a single processor
+    }
     localCopyRegion(in,out,fll,tll,local);
     return out;
   }
@@ -116,10 +311,22 @@ public:
    }
    return tmp;
  }
+  // Pad 'in' in every dimension in turn, dimension 0 first, using ExpandPeriodic.
+  template<class vobj>
+  inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in) const
+  {
+    const int ndim = in.Grid()->Nd();
+    Lattice<vobj> result = in;
+    for(int mu=0;mu<ndim;mu++){
+      result = ExpandPeriodic(mu,result); // assigned from the returned temporary
+    }
+    return result;
+  }
  // expand up one dim at a time
  template<class vobj>
  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
+    Coordinate processors=unpadded_grid->_processors;
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
    Lattice<vobj>  padded(new_grid);
@@ -129,46 +336,236 @@ public:
    if(dim==0) conformable(old_grid,unpadded_grid);
    else       conformable(old_grid,grids[dim-1]);

-    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
-
    double tins=0, tshift=0;
-    
-    // Middle bit
-    double t = usecond();
-    for(int x=0;x<local[dim];x++){
-      InsertSliceLocal(in,padded,x,depth+x,dim);
-    }
-    tins += usecond() - t;
-    
-    // High bit
-    t = usecond();
-    shifted = cshift.Cshift(in,dim,depth);
-    tshift += usecond() - t;

-    t=usecond();
-    for(int x=0;x<depth;x++){
-      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
-    }
-    tins += usecond() - t;
-    
-    // Low bit
-    t = usecond();
-    shifted = cshift.Cshift(in,dim,-depth);
-    tshift += usecond() - t;
-    
-    t = usecond();
-    for(int x=0;x<depth;x++){
-      InsertSliceLocal(shifted,padded,x,x,dim);
-    }
-    tins += usecond() - t;
+    int islocal = 0 ;
+    if ( processors[dim] == 1 ) islocal = 1;

+    if ( islocal ) {
+
+      // replace with a copy and maybe grid swizzle
+      // return in;??
+      double t = usecond();
+      padded = in;
+      tins += usecond() - t;
+      
+    } else {
+
+      //////////////////////////////////////////////
+      // Replace sequence with
+      // ---------------------
+      // (i) Gather high face(s); start comms
+      // (ii) Gather low  face(s); start comms
+      // (iii) Copy middle bit with localCopyRegion
+      // (iv) Complete high face(s), insert slice(s)
+      // (v) Complete low  face(s), insert slice(s)
+      //////////////////////////////////////////////
+      // Middle bit
+      double t = usecond();
+      for(int x=0;x<local[dim];x++){
+	InsertSliceLocal(in,padded,x,depth+x,dim);
+      }
+      tins += usecond() - t;
+    
+      // High bit
+      t = usecond();
+      shifted = cshift.Cshift(in,dim,depth);
+      tshift += usecond() - t;
+
+      t=usecond();
+      for(int x=0;x<depth;x++){
+	InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
+      }
+      tins += usecond() - t;
+    
+      // Low bit
+      t = usecond();
+      shifted = cshift.Cshift(in,dim,-depth);
+      tshift += usecond() - t;
+    
+      t = usecond();
+      for(int x=0;x<depth;x++){
+	InsertSliceLocal(shifted,padded,x,x,dim);
+      }
+      tins += usecond() - t;
+
+    }
    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
    
    return padded;
  }

+  // Pad 'in' by 'depth' sites on both sides of dimension 'dim'; the result lives on grids[dim].
+  // A dimension with a single processor gets no halo and the field is copied unchanged;
+  // otherwise Face_exchange copies the interior and fills both halos from the neighbours.
+  template<class vobj>
+  inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in) const
+  {
+    assert(dim>=0 && dim<(int)grids.size()); // grids[dim] and processors[dim] are indexed below
+    Coordinate processors=unpadded_grid->_processors;
+    GridBase *old_grid = in.Grid();
+    GridCartesian *new_grid = grids[dim];//These are new grids
+    Lattice<vobj>  padded(new_grid);
+
+    // Dimensions are expanded in order, so the input must live on the previous stage's grid
+    if(dim==0) conformable(old_grid,unpadded_grid);
+    else       conformable(old_grid,grids[dim-1]);
+
+    const bool islocal = ( processors[dim] == 1 );
+
+    if ( islocal ) {
+      // Undistributed dimension: grids[dim] is not enlarged in this dimension (see AllocateGrids)
+      padded=in; // slightly different interface could avoid a copy operation
+    } else {
+      // Distributed dimension: interior copy plus halo exchange with both neighbours
+      Face_exchange(in,padded,dim,depth);
+    }
+    return padded;
+  }
+  template<class vobj>
+  void Face_exchange(const Lattice<vobj> &from,
+		     Lattice<vobj> &to,
+		     int dimension,int depth) const
+  {
+    typedef typename vobj::vector_type vector_type;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::scalar_object sobj;
+    // Fill 'to' (2*depth sites longer than 'from' along dimension): interior copied locally, both halos received from neighbour ranks.
+    RealD t_gather=0.0;
+    RealD t_scatter=0.0;
+    RealD t_comms=0.0;
+    RealD t_copy=0.0;
+    
+    //    std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
+    //    DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
+    GridBase *grid=from.Grid();
+    GridBase *new_grid=to.Grid();
+
+    Coordinate lds = from.Grid()->_ldimensions;
+    Coordinate nlds=   to.Grid()->_ldimensions; // NOTE(review): not used below
+    Coordinate simd= from.Grid()->_simd_layout;
+    int ld    = lds[dimension];
+    int nld   = to.Grid()->_ldimensions[dimension];
+    const int Nsimd = vobj::Nsimd();
+
+    assert(depth<=lds[dimension]); // A must be on neighbouring node
+    assert(depth>0);   // A caller bug if zero
+    assert(ld+2*depth==nld);
+    ////////////////////////////////////////////////////////////////////////////
+    // Face size and byte calculations
+    ////////////////////////////////////////////////////////////////////////////
+    int buffer_size = 1;
+    for(int d=0;d<lds.size();d++){
+      if ( d!= dimension) buffer_size=buffer_size*lds[d];
+    }
+    buffer_size = buffer_size  / Nsimd; // vobj words needed to hold one plane orthogonal to dimension
+    int rNsimd = Nsimd / simd[dimension]; // NOTE(review): not used below
+    assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
+
+    static cshiftVector<vobj> send_buf; // NOTE(review): function-local statics are shared between calls for a given vobj — confirm callers never overlap
+    static cshiftVector<vobj> recv_buf;
+    send_buf.resize(buffer_size*2*depth);    // depth low planes followed by depth high planes
+    recv_buf.resize(buffer_size*2*depth);
+
+    std::vector<CommsRequest_t> fwd_req;   
+    std::vector<CommsRequest_t> bwd_req;   
+
+    int words = buffer_size;
+    int bytes = words * sizeof(vobj); // bytes in one plane
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Communication coords
+    ////////////////////////////////////////////////////////////////////////////
+    int comm_proc = 1;
+    int xmit_to_rank;
+    int recv_from_rank;
+    grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); // presumably the two ranks one processor step away along dimension — confirm
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Gather all surface terms up to depth "d"
+    ////////////////////////////////////////////////////////////////////////////
+    RealD t;
+    RealD t_tot=-usecond();
+    int plane=0; // running plane index into send_buf (gather) and later recv_buf (scatter)
+    for ( int d=0;d < depth ; d ++ ) { // low planes 0..depth-1 go to xmit_to_rank
+      int tag = d*1024 + dimension*2+0;
+
+      t=usecond();
+      GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
+      t_gather+=usecond()-t;
+
+      t=usecond();
+      grid->SendToRecvFromBegin(fwd_req,
+				(void *)&send_buf[d*buffer_size], xmit_to_rank,
+				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+      t_comms+=usecond()-t;
+     }
+    for ( int d=0;d < depth ; d ++ ) { // high planes ld-depth..ld-1 go the opposite way, to recv_from_rank
+      int tag = d*1024 + dimension*2+1;
+
+      t=usecond();
+      GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
+      t_gather+= usecond() - t;
+
+      t=usecond();
+      grid->SendToRecvFromBegin(bwd_req,
+				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
+				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+      t_comms+=usecond()-t;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Copy interior -- overlap this with comms
+    ////////////////////////////////////////////////////////////////////////////
+    int Nd = new_grid->Nd();
+    Coordinate LL(Nd,0);
+    Coordinate sz = grid->_ldimensions;
+    Coordinate toLL(Nd,0);
+    toLL[dimension]=depth; // interior starts after the low halo in the padded field
+    t=usecond();
+    localCopyRegion(from,to,LL,toLL,sz);
+    t_copy= usecond() - t;
+    
+    ////////////////////////////////////////////////////////////////////////////
+    // Scatter all faces
+    ////////////////////////////////////////////////////////////////////////////
+    plane=0;
+
+    t=usecond();
+    grid->CommsComplete(fwd_req);
+    t_comms+= usecond() - t;
+
+    t=usecond();
+    for ( int d=0;d < depth ; d ++ ) { // recv_buf planes 0..depth-1 fill the high halo
+      ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
+    }
+    t_scatter= usecond() - t;
+
+    t=usecond();
+    grid->CommsComplete(bwd_req);
+    t_comms+= usecond() - t;
+    
+    t=usecond();
+    for ( int d=0;d < depth ; d ++ ) { // recv_buf planes depth..2*depth-1 fill the low halo
+      ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
+    }
+    t_scatter+= usecond() - t;
+    t_tot+=usecond();
+
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000  << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000   << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy   :" << t_copy/1000      << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << t_comms/1000     << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total  :" << t_tot/1000     << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << (RealD)4.0*bytes/t_comms   << "MB/s"<<std::endl; // NOTE(review): no depth factor here, unlike the gather/scatter rates — confirm intended
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes  :" << depth*bytes/1e6 << "MB"<<std::endl;
+  }
+  
 };
 

 NAMESPACE_END(Grid);

+
@@ -165,7 +165,7 @@ class BinaryIO {
 	 * FIXME -- 128^3 x 256 x 16 will overflow.
 	 */
 	
-	int global_site;
+	int64_t global_site;

 	Lexicographic::CoorFromIndex(coor,local_site,local_vol);

@@ -175,8 +175,8 @@ class BinaryIO {

 	Lexicographic::IndexFromCoor(coor,global_site,global_vol);

-	uint32_t gsite29   = global_site%29;
-	uint32_t gsite31   = global_site%31;
+	uint64_t gsite29   = global_site%29;
+	uint64_t gsite31   = global_site%31;
 	
 	site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
 	//	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
@@ -545,7 +545,9 @@ class BinaryIO {
 				       const std::string &format,
 				       uint32_t &nersc_csum,
 				       uint32_t &scidac_csuma,
-				       uint32_t &scidac_csumb)
+				       uint32_t &scidac_csumb,
+				       int control=BINARYIO_LEXICOGRAPHIC
+				       )
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
@@ -556,7 +558,7 @@ class BinaryIO {
    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
    
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control,
 	     nersc_csum,scidac_csuma,scidac_csumb);

    GridStopWatch timer; 
@@ -582,7 +584,8 @@ class BinaryIO {
 					  const std::string &format,
 					  uint32_t &nersc_csum,
 					  uint32_t &scidac_csuma,
-					  uint32_t &scidac_csumb)
+					  uint32_t &scidac_csumb,
+					  int control=BINARYIO_LEXICOGRAPHIC) // OR'ed with BINARYIO_WRITE / BINARYIO_READ in the IOobject calls below
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
@@ -607,7 +610,7 @@ class BinaryIO {
    while (attemptsLeft >= 0)
    {
      grid->Barrier();
-      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control,
 	             nersc_csum,scidac_csuma,scidac_csumb);
      if (checkWrite)
      {
@@ -617,7 +620,7 @@ class BinaryIO {

        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
        grid->Barrier();
-        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control,
 	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
        {
@@ -162,8 +162,14 @@ template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
 {
   uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
   uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
-   if ( scidac_csuma !=scidac_checksuma) return 0;
-   if ( scidac_csumb !=scidac_checksumb) return 0;
+   std::cout << GridLogMessage << " scidacChecksumVerify A computed "<<std::hex<<scidac_csuma<<" expected "<<scidac_checksuma<<std::dec<<std::endl; // hex, matching the other SciDAC checksum logs
+   std::cout << GridLogMessage << " scidacChecksumVerify B computed "<<std::hex<<scidac_csumb<<" expected "<<scidac_checksumb<<std::dec<<std::endl; // std::dec restores the sticky base on std::cout
+   if ( scidac_csuma !=scidac_checksuma) { // checksum A mismatch
+     return 0;
+   }
+   if ( scidac_csumb !=scidac_checksumb) { // checksum B mismatch
+     return 0;
+   }
   return 1;
 }

@@ -206,7 +212,7 @@ class GridLimeReader : public BinaryIO {
  // Read a generic lattice field and verify checksum
  ////////////////////////////////////////////
  template<class vobj>
-  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
+  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC) // control is forwarded to BinaryIO::readLatticeObject
  {
    typedef typename vobj::scalar_object sobj;
    scidacChecksum scidacChecksum_;
@@ -238,7 +244,7 @@ class GridLimeReader : public BinaryIO {
 	uint64_t offset= ftello(File);
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
-	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control);
 	std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
 	std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
 	/////////////////////////////////////////////
@@ -408,7 +414,7 @@ class GridLimeWriter : public BinaryIO
  // in communicator used by the field.Grid()
  ////////////////////////////////////////////////////
  template<class vobj>
-  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
+  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC) // control is forwarded to BinaryIO::writeLatticeObject
  {
    ////////////////////////////////////////////////////////////////////
    // NB: FILE and iostream are jointly writing disjoint sequences in the
@@ -459,7 +465,7 @@ class GridLimeWriter : public BinaryIO
    ///////////////////////////////////////////
    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
-    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control);

    ///////////////////////////////////////////
    // Wind forward and close the record
@@ -512,7 +518,8 @@ class ScidacWriter : public GridLimeWriter {
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
-                              const unsigned int recordScientificPrec = 0) 
+                              const unsigned int recordScientificPrec = 0,
+			      int control=BINARYIO_LEXICOGRAPHIC) // control is forwarded to writeLimeLatticeBinaryObject
  {
    GridBase * grid = field.Grid();

@@ -534,7 +541,7 @@ class ScidacWriter : public GridLimeWriter {
      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    }
    // Collective call
-    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
+    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);      // Closes message with checksum
  }
 };

@@ -553,7 +560,8 @@ class ScidacReader : public GridLimeReader {
  // Write generic lattice field in scidac format
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
-  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
+  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord,
+			     int control=BINARYIO_LEXICOGRAPHIC) // control is forwarded to readLimeLatticeBinaryObject
  {
    typedef typename vobj::scalar_object sobj;
    GridBase * grid = field.Grid();
@@ -571,7 +579,7 @@ class ScidacReader : public GridLimeReader {
    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
-    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
+    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);
  }
  void skipPastBinaryRecord(void) {
    std::string rec_name(ILDG_BINARY_DATA);
@@ -67,7 +67,6 @@ NAMESPACE_CHECK(Scalar);
 #include <Grid/qcd/utils/Metric.h>
 NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
-#include <Grid/qcd/utils/CovariantLaplacianRat.h>
 NAMESPACE_CHECK(CovariantLaplacian);


@@ -65,19 +65,6 @@ struct WilsonImplParams {
  }
 };

-struct GaugeImplParams {
-//  bool overlapCommsCompute;
-//  AcceleratorVector<Real,Nd> twist_n_2pi_L;
-  AcceleratorVector<Complex,Nd> boundary_phases;
-  GaugeImplParams()  {
-    boundary_phases.resize(Nd, 1.0);
-//      twist_n_2pi_L.resize(Nd, 0.0);
-  };
-  GaugeImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi) {
-//    twist_n_2pi_L.resize(Nd, 0.0);
-  }
-};
-
 struct StaggeredImplParams {
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  int  partialDirichlet;
@@ -32,7 +32,7 @@ directory

 NAMESPACE_BEGIN(Grid);

-#undef CPS_MD_TIME
+#define CPS_MD_TIME

 #ifdef CPS_MD_TIME
 #define HMC_MOMENTUM_DENOMINATOR (2.0)
@@ -42,13 +42,9 @@ template <class Gimpl>
 class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
 public:  
  INHERIT_GIMPL_TYPES(Gimpl);
-  typedef GaugeImplParams ImplParams;
-  ImplParams Params;

  /////////////////////////// constructors
-  explicit WilsonGaugeAction(RealD beta_,
-		  const ImplParams &p = ImplParams()
-		  ):beta(beta_),Params(p){};
+  explicit WilsonGaugeAction(RealD beta_):beta(beta_){};

  virtual std::string action_name() {return "WilsonGaugeAction";}

@@ -60,53 +56,14 @@ public:

  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){};  // noop as no pseudoferms

-// Umu<->U maximally confusing
-  virtual void boundary(const GaugeField &Umu, GaugeField &Ub){
-    typedef typename Simd::scalar_type scalar_type;
-    assert(Params.boundary_phases.size() == Nd);
-    GridBase *GaugeGrid=Umu.Grid();
-    GaugeLinkField U(GaugeGrid);
-    GaugeLinkField tmp(GaugeGrid);
-
-    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-    for (int mu = 0; mu < Nd; mu++) {
-	////////// boundary phase /////////////
-      auto pha = Params.boundary_phases[mu];
-      scalar_type phase( real(pha),imag(pha) );
-      std::cout<< GridLogIterative << "[WilsonGaugeAction] boundary "<<mu<<" "<<phase<< std::endl; 
-
-	int L   = GaugeGrid->GlobalDimensions()[mu];
-        int Lmu = L - 1;
-
-      LatticeCoordinate(coor, mu);
-
-      U = PeekIndex<LorentzIndex>(Umu, mu);
-      tmp = where(coor == Lmu, phase * U, U);
-      PokeIndex<LorentzIndex>(Ub, tmp, mu);
-//      PokeIndex<LorentzIndex>(Ub, U, mu);
-//      PokeIndex<LorentzIndex>(Umu, tmp, mu);
-
-    }
-  };
-
  virtual RealD S(const GaugeField &U) {
-    GaugeField Ub(U.Grid());
-    this->boundary(U,Ub);
-    static RealD lastG=0.;
-    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(Ub);
-    RealD vol = Ub.Grid()->gSites();
+    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+    RealD vol = U.Grid()->gSites();
    RealD action = beta * (1.0 - plaq) * (Nd * (Nd - 1.0)) * vol * 0.5;
-    std::cout << GridLogMessage << "[WilsonGaugeAction] dH: " << action-lastG << std::endl;
-    RealD plaq_o = WilsonLoops<Gimpl>::avgPlaquette(U);
-    RealD action_o = beta * (1.0 - plaq_o) * (Nd * (Nd - 1.0)) * vol * 0.5;
-    std::cout << GridLogMessage << "[WilsonGaugeAction] U: " << action_o <<" Ub: "<< action  << std::endl;
-    lastG=action;
    return action;
  };

  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
-    GaugeField Ub(U.Grid());
-    this->boundary(U,Ub);
    // not optimal implementation FIXME
    // extend Ta to include Lorentz indexes

@@ -116,9 +73,10 @@ public:
    GaugeLinkField dSdU_mu(U.Grid());
    for (int mu = 0; mu < Nd; mu++) {

-      Umu = PeekIndex<LorentzIndex>(Ub, mu);
+      Umu = PeekIndex<LorentzIndex>(U, mu);
+      
      // Staple in direction mu
-      WilsonLoops<Gimpl>::Staple(dSdU_mu, Ub, mu);
+      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
      dSdU_mu = Ta(Umu * dSdU_mu) * factor;
      
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
@@ -178,10 +178,7 @@ NAMESPACE_BEGIN(Grid);
        // Use chronological inverter to forecast solutions across poles
        std::vector<FermionField> prev_solns;
        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
-	MdagMLinearOperator<AbstractEOFAFermion<Impl> ,FermionField> MdagML(Lop);
-	MdagMLinearOperator<AbstractEOFAFermion<Impl> ,FermionField> MdagMR(Rop);
-//        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
-	ChronoForecast<MdagMLinearOperator<AbstractEOFAFermion<Impl>, FermionField> , FermionField> Forecast;
+        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;

        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
@@ -201,7 +198,7 @@ NAMESPACE_BEGIN(Grid);
          heatbathRefreshShiftCoefficients(0, -gamma_l);
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
-            CG_soln = Forecast(MdagML, Forecast_src, prev_solns);
+            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
            SolverHBL(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
@@ -228,7 +225,7 @@ NAMESPACE_BEGIN(Grid);
 	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]);
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
-            CG_soln = Forecast(MdagMR, Forecast_src, prev_solns);
+            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
            SolverHBR(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
@@ -1,6 +1,6 @@
 #pragma once

-#undef CPS_MD_TIME 
+#define CPS_MD_TIME 

 #ifdef CPS_MD_TIME
 #define HMC_MOMENTUM_DENOMINATOR (2.0)
@@ -121,19 +121,12 @@ public:

  template <class SmearingPolicy>
  void Run(SmearingPolicy &S) {
-    TrivialMetric<typename Implementation::Field> Mtr;
-    Runner(S,Mtr);
-  }
-
-  template <class SmearingPolicy, class Metric>
-  void Run(SmearingPolicy &S, Metric &Mtr) {
-    Runner(S,Mtr);
+    Runner(S);
  }

  void Run(){
    NoSmearing<Implementation> S;
-    TrivialMetric<typename Implementation::Field> Mtr;
-    Runner(S,Mtr);
+    Runner(S);
  }

  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
@@ -183,15 +176,15 @@ public:
  //////////////////////////////////////////////////////////////////

 private:
-  template <class SmearingPolicy, class Metric>
-  void Runner(SmearingPolicy &Smearing, Metric &Mtr) {
+  template <class SmearingPolicy>
+  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Field U(UGrid);

    initializeGaugeFieldAndRNGs(U);

    typedef IntegratorType<SmearingPolicy> TheIntegrator;
-    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing,Mtr);
+    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);

    // Sets the momentum filter
    MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter()));
@@ -55,8 +55,6 @@ struct HMCparameters: Serializable {
                                  Integer, NoMetropolisUntil,
 				  bool, PerformRandomShift, /* @brief Randomly shift the gauge configuration at the start of a trajectory */
                                  std::string, StartingType,
-				  Integer, SW,
-                                  RealD, Kappa,
                                  IntegratorParameters, MD)

  HMCparameters() {
@@ -112,8 +110,6 @@ private:
  IntegratorType &TheIntegrator;
  ObsListType Observables;

-  int traj_num;
-
  /////////////////////////////////////////////////////////
  // Metropolis step
  /////////////////////////////////////////////////////////
@@ -204,14 +200,14 @@ private:

    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << " Molecular Dynamics evolution ";
-    TheIntegrator.integrate(U,traj_num);
+    TheIntegrator.integrate(U);
    std::cout << GridLogMessage << "--------------------------------------------------\n";

    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // updated state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "Compute final action" <<std::endl;
+    std::cout << GridLogMessage << "Compute final action" << std::endl; // end the line before TheIntegrator.S(U) runs
    RealD H1 = TheIntegrator.S(U);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";

@@ -246,7 +242,7 @@ public:
  HybridMonteCarlo(HMCparameters _Pams, IntegratorType &_Int,
                   GridSerialRNG &_sRNG, GridParallelRNG &_pRNG, 
                   ObsListType _Obs, Field &_U)
-    : Params(_Pams), TheIntegrator(_Int), sRNG(_sRNG), pRNG(_pRNG), Observables(_Obs), Ucur(_U),traj_num(0) {}
+    : Params(_Pams), TheIntegrator(_Int), sRNG(_sRNG), pRNG(_pRNG), Observables(_Obs), Ucur(_U) {}
  ~HybridMonteCarlo(){};

  void evolve(void) {
@@ -261,10 +257,9 @@ public:
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;

    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-    

      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
-      traj_num=traj;
+
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
@@ -9,7 +9,6 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <cossu@post.kek.jp>
-Author: Chulwoo Jung <chulwoo@bnl.gov>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -34,7 +33,6 @@ directory
 #define INTEGRATOR_INCLUDED

 #include <memory>
-#include <Grid/parallelIO/NerscIO.h>

 NAMESPACE_BEGIN(Grid);

@@ -43,19 +41,10 @@ public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(IntegratorParameters,
 				  std::string, name,      // name of the integrator
 				  unsigned int, MDsteps,  // number of outer steps
-				  RealD, RMHMCTol,
-                                  RealD, RMHMCCGTol,
-                                  RealD, lambda0,
-                                  RealD, lambda1,
-                                  RealD, lambda2,
 				  RealD, trajL)           // trajectory length

  IntegratorParameters(int MDsteps_ = 10, RealD trajL_ = 1.0)
  : MDsteps(MDsteps_),
-   lambda0(0.1931833275037836),
-   lambda1(0.1931833275037836),
-   lambda2(0.1931833275037836),
-   RMHMCTol(1e-8),RMHMCCGTol(1e-8),
    trajL(trajL_) {};

  template <class ReaderClass, typename std::enable_if<isReader<ReaderClass>::value, int >::type = 0 >
@@ -86,14 +75,11 @@ public:
  double t_U;  // Track time passing on each level and for U and for P
  std::vector<double> t_P;  

-//  MomentaField P;
-  GeneralisedMomenta<FieldImplementation > P;
+  MomentaField P;
  SmearingPolicy& Smearer;
  RepresentationPolicy Representations;
  IntegratorParameters Params;

-  RealD Saux,Smom,Sg;
-
  //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC
  //It is applied whenever the momentum is updated / refreshed
  //The default filter does nothing
@@ -110,16 +96,7 @@ public:
  void update_P(Field& U, int level, double ep) 
  {
    t_P[level] += ep;
-    update_P(P.Mom, U, level, ep);
-
-    std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
-  }
-
-  void update_P2(Field& U, int level, double ep) 
-  {
-    t_P[level] += ep;
-    update_P2(P.Mom, U, level, ep);
-
+    update_P(P, U, level, ep);
    std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
  }

@@ -142,174 +119,62 @@ public:
    }
  } update_P_hireps{};

+ 
  void update_P(MomentaField& Mom, Field& U, int level, double ep) {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing

    for (int a = 0; a < as[level].actions.size(); ++a) {
+
      double start_full = usecond();
      Field force(U.Grid());
      conformable(U.Grid(), Mom.Grid());

-      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();
-      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta

-      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
-      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
+      as[level].actions.at(a)->deriv_timer_start();
+      as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta
+      as[level].actions.at(a)->deriv_timer_stop();
+
+      auto name = as[level].actions.at(a)->action_name();
+
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+      
+      MomFilter->applyFilter(force); // filter the force before its norms are logged and it is subtracted from Mom
+
+      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl;
+      
+      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
+      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      Real force_max   = std::sqrt(maxLocalNorm2(force));
+      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
+      
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] dt           : " << ep <<" "<<name<<std::endl;
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl;
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average  : " << impulse_abs <<" "<<name<<std::endl;
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt max      : " << impulse_max <<" "<<name<<std::endl;
+
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
      double time_force = (end_force - start_force) / 1e3;
      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
+
    }

    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
-  }

-  void update_P2(MomentaField& Mom, Field& U, int level, double ep) {
-    // input U actually not used in the fundamental case
-    // Fundamental updates, include smearing
-
-    std::cout << GridLogIntegrator << "U before update_P2: " << std::sqrt(norm2(U)) << std::endl;
-    // Generalised momenta  
-    // Derivative of the kinetic term must be computed before
-    // Mom is the momenta and gets updated by the 
-    // actions derivatives
-    MomentaField MomDer(P.Mom.Grid());
-    P.M.ImportGauge(U);
-    P.DerivativeU(P.Mom, MomDer);
-    std::cout << GridLogIntegrator << "MomDer update_P2: " << std::sqrt(norm2(MomDer)) << std::endl;
-//    Mom -= MomDer * ep;
-    Mom -= MomDer * ep * HMC_MOMENTUM_DENOMINATOR;
-    std::cout << GridLogIntegrator << "Mom update_P2: " << std::sqrt(norm2(Mom)) << std::endl;
-
-    // Auxiliary fields
-    P.update_auxiliary_momenta(ep*0.5 );
-    P.AuxiliaryFieldsDerivative(MomDer);
-    std::cout << GridLogIntegrator << "MomDer(Aux) update_P2: " << std::sqrt(norm2(Mom)) << std::endl;
-//    Mom -= MomDer * ep;
-    Mom -= MomDer * ep * HMC_MOMENTUM_DENOMINATOR;
-    P.update_auxiliary_momenta(ep*0.5 );
-
-    for (int a = 0; a < as[level].actions.size(); ++a) {
-      double start_full = usecond();
-      Field force(U.Grid());
-      conformable(U.Grid(), Mom.Grid());
-
-      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
-      double start_force = usecond();
-      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
-
-      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
-      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
-      force = FieldImplementation::projectForce(force); // Ta for gauge fields
-      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
-      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
-      double end_full = usecond();
-      double time_full  = (end_full - start_full) / 1e3;
-      double time_force = (end_force - start_force) / 1e3;
-      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
-    }
-
-    // Force from the other representations
-    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
-  }
-
-  void implicit_update_P(Field& U, int level, double ep, double ep1, bool intermediate = false) {
-    t_P[level] += ep;
-
-    double ep2= ep-ep1;
-
-    std::cout << GridLogIntegrator << "[" << level << "] P "
-              << " dt " << ep << " : t_P " << t_P[level] << std::endl;
-    std::cout << GridLogIntegrator << "U before implicit_update_P: " << std::sqrt(norm2(U)) << std::endl;
-    // Fundamental updates, include smearing
-    MomentaField Msum(P.Mom.Grid());
-    Msum = Zero();
-    for (int a = 0; a < as[level].actions.size(); ++a) {
-      // Compute the force terms for the lagrangian part
-      // We need to compute the derivative of the actions
-      // only once
-      Field force(U.Grid());
-      conformable(U.Grid(), P.Mom.Grid());
-      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
-      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
-
-      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
-      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
-      force = FieldImplementation::projectForce(force);  // Ta for gauge fields
-      Real force_abs = std::sqrt(norm2(force) / U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "|Force| site average: " << force_abs
-                << std::endl;
-      Msum += force;
-    }
-
-    MomentaField NewMom = P.Mom;
-    MomentaField OldMom = P.Mom;
-    double threshold = Params.RMHMCTol;
-    P.M.ImportGauge(U);
-    MomentaField MomDer(P.Mom.Grid());
-    MomentaField MomDer1(P.Mom.Grid());
-    MomentaField AuxDer(P.Mom.Grid());
-    MomDer1 = Zero();
-    MomentaField diff(P.Mom.Grid());
-    double factor = 2.0;
-    if (intermediate){
-      P.DerivativeU(P.Mom, MomDer1);
-      factor = 1.0;
-    }
-//    std::cout << GridLogIntegrator << "MomDer1 implicit_update_P: " << std::sqrt(norm2(MomDer1)) << std::endl;
-
-    // Auxiliary fields
-    P.update_auxiliary_momenta(ep1);
-    P.AuxiliaryFieldsDerivative(AuxDer);
-    Msum += AuxDer;
-    
-
-    // Here run recursively
-    int counter = 1;
-    RealD RelativeError;
-    do {
-      std::cout << GridLogIntegrator << "UpdateP implicit step "<< counter << std::endl;
-
-      // Compute the derivative of the kinetic term
-      // with respect to the gauge field
-      P.DerivativeU(NewMom, MomDer);
-      Real force_abs = std::sqrt(norm2(MomDer) / U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "|Force| laplacian site average: " << force_abs
-                << std::endl;
-
-//      NewMom = P.Mom - ep* 0.5 * HMC_MOMENTUM_DENOMINATOR * (2.0*Msum + factor*MomDer + MomDer1);// simplify
-      NewMom = P.Mom -  HMC_MOMENTUM_DENOMINATOR * (ep*Msum + ep1* factor*MomDer + ep2* MomDer1);// simplify
-      diff = NewMom - OldMom;
-      counter++;
-      RelativeError = std::sqrt(norm2(diff))/std::sqrt(norm2(NewMom));
-      std::cout << GridLogIntegrator << "UpdateP RelativeError: " << RelativeError << std::endl;
-      OldMom = NewMom;
-    } while (RelativeError > threshold);
-
-    P.Mom = NewMom;
-    std::cout << GridLogIntegrator << "NewMom implicit_update_P: " << std::sqrt(norm2(NewMom)) << std::endl;
-
-    // update the auxiliary fields momenta    
-    P.update_auxiliary_momenta(ep2);
-  }
-
-  void implicit_update_P(Field& U, int level, double ep, bool intermediate = false) {
-      implicit_update_P( U, level, ep, ep*0.5, intermediate ); 
  }

  void update_U(Field& U, double ep) 
  {
-    update_U(P.Mom, U, ep);
+    update_U(P, U, ep);

    t_U += ep;
    int fl = levels - 1;
@@ -318,8 +183,12 @@ public:
  
  void update_U(MomentaField& Mom, Field& U, double ep) 
  {
+    MomentaField MomFiltered(Mom.Grid()); // work on a copy so the filter does not modify Mom itself
+    MomFiltered = Mom;
+    MomFilter->applyFilter(MomFiltered); // the filtered copy is what drives the field update below
+
    // exponential of Mom*U in the gauge fields case
-    FieldImplementation::update_field(Mom, U, ep);
+    FieldImplementation::update_field(MomFiltered, U, ep);

    // Update the smeared fields, can be implemented as observer
    Smearer.set_Field(U);
@@ -328,74 +197,18 @@ public:
    Representations.update(U);  // void functions if fundamental representation
  }

-  void implicit_update_U(Field&U, double ep, double ep1 ){
-    double ep2=ep-ep1;
-    t_U += ep;
-    int fl = levels - 1;
-    std::cout << GridLogIntegrator << "   " << "[" << fl << "] U " << " dt " << ep << " : t_U " << t_U << std::endl;
-    std::cout << GridLogIntegrator << "U before implicit_update_U: " << std::sqrt(norm2(U)) << std::endl;
-
-    MomentaField Mom1(P.Mom.Grid());
-    MomentaField Mom2(P.Mom.Grid());
-    RealD RelativeError;
-    Field diff(U.Grid());
-    Real threshold =  Params.RMHMCTol;
-    int counter = 1;
-    int MaxCounter = 100;
-
-    Field OldU = U;
-    Field NewU = U;
-
-    P.M.ImportGauge(U);
-    P.DerivativeP(Mom1); // first term in the derivative 
-    std::cout << GridLogIntegrator << "implicit_update_U: Mom1: " << std::sqrt(norm2(Mom1)) << std::endl;
-
-    P.update_auxiliary_fields(ep1);
-
-
-    MomentaField sum=Mom1;
-    do {
-      std::cout << GridLogIntegrator << "UpdateU implicit step "<< counter << std::endl;
-      
-      P.DerivativeP(Mom2); // second term in the derivative, on the updated U
-      std::cout << GridLogIntegrator << "implicit_update_U: Mom1: " << std::sqrt(norm2(Mom1)) << std::endl;
-      sum = (Mom1*ep1 + Mom2*ep2);
-
-      for (int mu = 0; mu < Nd; mu++) {
-        auto Umu = PeekIndex<LorentzIndex>(U, mu);
-        auto Pmu = PeekIndex<LorentzIndex>(sum, mu);
-        Umu = expMat(Pmu, 1, 12) * Umu;
-        PokeIndex<LorentzIndex>(NewU, ProjectOnGroup(Umu), mu);
-      }
-
-      diff = NewU - OldU;
-      RelativeError = std::sqrt(norm2(diff))/std::sqrt(norm2(NewU));
-      std::cout << GridLogIntegrator << "UpdateU RelativeError: " << RelativeError << std::endl;
-      
-      P.M.ImportGauge(NewU);
-      OldU = NewU; // some redundancy to be eliminated
-      counter++;
-    } while (RelativeError > threshold && counter < MaxCounter);
-
-    U = NewU;
-    std::cout << GridLogIntegrator << "NewU implicit_update_U: " << std::sqrt(norm2(U)) << std::endl;
-    P.update_auxiliary_fields(ep2);
-  }
-
-
  virtual void step(Field& U, int level, int first, int last) = 0;

 public:
  Integrator(GridBase* grid, IntegratorParameters Par,
             ActionSet<Field, RepresentationPolicy>& Aset,
-             SmearingPolicy& Sm, Metric<MomentaField>& M)
+             SmearingPolicy& Sm)
    : Params(Par),
      as(Aset),
-      P(grid, M),
+      P(grid),
      levels(Aset.size()),
      Smearer(Sm),
-      Representations(grid),
-      Saux(0.),Smom(0.),Sg(0.)
+      Representations(grid) 
  {
    t_P.resize(levels, 0.0);
    t_U = 0.0;
@@ -511,8 +324,7 @@ public:

  void reverse_momenta()
  {
-    P.Mom *= -1.0;
-    P.AuxMom *= -1.0;
+    P *= -1.0;
  }

  // to be used by the actionlevel class to iterate
@@ -531,14 +343,11 @@ public:
  // Initialization of momenta and actions
  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
-    assert(P.Mom.Grid() == U.Grid());
+    assert(P.Grid() == U.Grid());
    std::cout << GridLogIntegrator << "Integrator refresh" << std::endl;

    std::cout << GridLogIntegrator << "Generating momentum" << std::endl;
-//    FieldImplementation::generate_momenta(P.Mom, sRNG, pRNG);
-    P.M.ImportGauge(U);
-    P.MomentaDistribution(sRNG,pRNG);
-
+    FieldImplementation::generate_momenta(P, sRNG, pRNG);

    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
@@ -593,22 +402,9 @@ public:

    std::cout << GridLogIntegrator << "Integrator action\n";

-//    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
-//    RealD Hterm;
-
-//    static RealD Saux=0.,Smom=0.,Sg=0.;
-
-    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
-    std::cout << GridLogMessage << "S:FieldSquareNorm H_p = " << H << "\n";
-    std::cout << GridLogMessage << "S:dSField = " << H-Smom << "\n";
-    Smom=H;
-    P.M.ImportGauge(U);
-    RealD Hterm = - P.MomentaAction();
-    std::cout << GridLogMessage << "S:Momentum action H_p = " << Hterm << "\n";
-    std::cout << GridLogMessage << "S:dSMom = " << Hterm-Saux << "\n";
-    Saux=Hterm;
-    H = Hterm;
+    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom

+    RealD Hterm;

    // Actions
    for (int level = 0; level < as.size(); ++level) {
@@ -650,18 +446,9 @@ public:

    std::cout << GridLogIntegrator << "Integrator initial action\n";

-//    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
-//    RealD Hterm;
-    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
-    std::cout << GridLogMessage << "S:FieldSquareNorm H_p = " << H << "\n";
-    std::cout << GridLogMessage << "S:dSField = " << H-Smom << "\n";
-    Smom=H;
-    P.M.ImportGauge(U);
-    RealD Hterm = - P.MomentaAction();
-    std::cout << GridLogMessage << "S:Momentum action H_p = " << Hterm << "\n";
-    std::cout << GridLogMessage << "S:dSMom = " << Hterm-Saux << "\n";
-    Saux=Hterm;
-    H = Hterm;
+    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
+
+    RealD Hterm;

    // Actions
    for (int level = 0; level < as.size(); ++level) {
@@ -684,7 +471,7 @@ public:
  }

  
-  void integrate(Field& U, int traj=-1 ) 
+  void integrate(Field& U) 
  {
    // reset the clocks
    t_U = 0;
@@ -696,12 +483,6 @@ public:
      int first_step = (stp == 0);
      int last_step = (stp == Params.MDsteps - 1);
      this->step(U, 0, first_step, last_step);
-      if (traj>=0){
-        std::string file("./config."+std::to_string(traj)+"_"+std::to_string(stp+1) );
-        int precision32 = 0;
-        int tworow      = 0;
-        NerscIO::writeConfiguration(U,file,tworow,precision32);
-      }
    }

    // Check the clocks all match on all levels
@@ -711,6 +492,7 @@ public:
    }

    FieldImplementation::Project(U);
+
    // and that we indeed got to the end of the trajectory
    assert(fabs(t_U - Params.trajL) < 1.0e-6);

@@ -102,8 +102,8 @@ public:

  std::string integrator_name(){return "LeapFrog";}

-  LeapFrog(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
-    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm,M){};
+  LeapFrog(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm)
+    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm){};

  void step(Field& U, int level, int _first, int _last) {
    int fl = this->as.size() - 1;
@@ -140,14 +140,14 @@ template <class FieldImplementation_, class SmearingPolicy, class Representation
 class MinimumNorm2 : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy> 
 {
 private:
-//  const RealD lambda = 0.1931833275037836;
+  const RealD lambda = 0.1931833275037836;

 public:
  typedef FieldImplementation_ FieldImplementation;
  INHERIT_FIELD_TYPES(FieldImplementation);

-  MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
-    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm,M){};
+  MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm)
+    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm){};

  std::string integrator_name(){return "MininumNorm2";}

@@ -155,11 +155,6 @@ public:
    // level  : current level
    // fl     : final level
    // eps    : current step size
-    assert(level<3);
-    RealD lambda= this->Params.lambda0;
-    if (level>0) lambda= this->Params.lambda1;
-    if (level>1) lambda= this->Params.lambda2;
-    std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl;

    int fl = this->as.size() - 1;

@@ -215,9 +210,9 @@ public:
  // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
  ForceGradient(GridBase* grid, IntegratorParameters Par,
                ActionSet<Field, RepresentationPolicy>& Aset,
-                SmearingPolicy& Sm, Metric<Field>& M)
+                SmearingPolicy& Sm)
    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
-									    grid, Par, Aset, Sm,M){};
+									    grid, Par, Aset, Sm){};

  std::string integrator_name(){return "ForceGradient";}
  
@@ -280,255 +275,6 @@ public:
  }
 };

-////////////////////////////////
-// Riemannian Manifold HMC
-// Girolami et al
-////////////////////////////////
-
-
-
-// correct
-template <class FieldImplementation, class SmearingPolicy,
-          class RepresentationPolicy =
-              Representations<FundamentalRepresentation> >
-class ImplicitLeapFrog : public Integrator<FieldImplementation, SmearingPolicy,
-                                           RepresentationPolicy> {
- public:
-  typedef ImplicitLeapFrog<FieldImplementation, SmearingPolicy, RepresentationPolicy>
-      Algorithm;
-  INHERIT_FIELD_TYPES(FieldImplementation);
-
-  // Riemannian manifold metric operator
-  // Hermitian operator Fisher
-
-  std::string integrator_name(){return "ImplicitLeapFrog";}
-
-  ImplicitLeapFrog(GridBase* grid, IntegratorParameters Par,
-           ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
-      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
-            grid, Par, Aset, Sm, M){};
-
-  void step(Field& U, int level, int _first, int _last) {
-    int fl = this->as.size() - 1;
-    // level  : current level
-    // fl     : final level
-    // eps    : current step size
-
-    // Get current level step size
-    RealD eps = this->Params.trajL/this->Params.MDsteps;
-    for (int l = 0; l <= level; ++l) eps /= this->as[l].multiplier;
-
-    int multiplier = this->as[level].multiplier;
-    for (int e = 0; e < multiplier; ++e) {
-      int first_step = _first && (e == 0);
-      int last_step = _last && (e == multiplier - 1);
-
-      if (first_step) {  // initial half step
-       this->implicit_update_P(U, level, eps / 2.0);
-      }
-
-      if (level == fl) {  // lowest level
-        this->implicit_update_U(U, eps,eps/2.);
-      } else {  // recursive function call
-        this->step(U, level + 1, first_step, last_step);
-      }
-
-      //int mm = last_step ? 1 : 2;
-      if (last_step){
-        this->update_P2(U, level, eps / 2.0);
-      } else {
-      this->implicit_update_P(U, level, eps, true);// works intermediate step
-      }
-    }
-  }
-};
-
-
-template <class FieldImplementation, class SmearingPolicy,
-          class RepresentationPolicy =
-              Representations<FundamentalRepresentation> >
-class ImplicitMinimumNorm2 : public Integrator<FieldImplementation, SmearingPolicy,
-                                       RepresentationPolicy> {
- private:
-//  const RealD lambda = 0.1931833275037836;
-
- public:
-  INHERIT_FIELD_TYPES(FieldImplementation);
-
-  ImplicitMinimumNorm2(GridBase* grid, IntegratorParameters Par,
-               ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
-      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
-            grid, Par, Aset, Sm, M){};
-
-  std::string integrator_name(){return "ImplicitMininumNorm2";}
-
-  void step(Field& U, int level, int _first, int _last) {
-    // level  : current level
-    // fl     : final level
-    // eps    : current step size
-
-    int fl = this->as.size() - 1;
-//    assert(Params.lambda.size()>level);
-//    RealD lambda= Params.lambda[level];
-    assert(level<3);
-    RealD lambda= this->Params.lambda0;
-    if (level>0) lambda= this->Params.lambda1;
-    if (level>1) lambda= this->Params.lambda2;
-    std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl;
-
-  if(level<fl){
-
-    RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
-    for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
-
-    // Nesting:  2xupdate_U of size eps/2
-    // Next level is eps/2/multiplier
-
-    int multiplier = this->as[level].multiplier;
-    for (int e = 0; e < multiplier; ++e) {  // steps per step
-
-      int first_step = _first && (e == 0);
-      int last_step = _last && (e == multiplier - 1);
-
-      if (first_step) {  // initial half step
-        this->update_P(U, level, lambda * eps);
-      }
-
-        this->step(U, level + 1, first_step, 0);
-
-      this->update_P(U, level, (1.0 - 2.0 * lambda) * eps);
-
-        this->step(U, level + 1, 0, last_step);
-
-      int mm = (last_step) ? 1 : 2;
-      this->update_P(U, level, lambda * eps * mm);
-    }
-  } 
-  else 
-  { // last level
-    RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
-    for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
-
-    // Nesting:  2xupdate_U of size eps/2
-    // Next level is eps/2/multiplier
-
-    int multiplier = this->as[level].multiplier;
-    for (int e = 0; e < multiplier; ++e) {  // steps per step
-
-      int first_step = _first && (e == 0);
-      int last_step = _last && (e == multiplier - 1);
-
-      if (first_step) {  // initial half step
-        this->implicit_update_P(U, level, lambda * eps);
-      }
-
-      this->implicit_update_U(U, 0.5 * eps,lambda*eps);
-
-      this->implicit_update_P(U, level, (1.0 - 2.0 * lambda) * eps, true);
-
-      this->implicit_update_U(U, 0.5 * eps, (0.5-lambda)*eps);
-
-      if (last_step) {
-        this->update_P2(U, level, eps * lambda);
-      } else {
-        this->implicit_update_P(U, level, lambda * eps*2.0, true);
-      }
-    }
-  }
-
-  }
-};
-
-template <class FieldImplementation, class SmearingPolicy,
-          class RepresentationPolicy =
-              Representations<FundamentalRepresentation> >
-class ImplicitCampostrini : public Integrator<FieldImplementation, SmearingPolicy,
-                                       RepresentationPolicy> {
- private:
-//  const RealD lambda = 0.1931833275037836;
-
- public:
-  INHERIT_FIELD_TYPES(FieldImplementation);
-
-  ImplicitCampostrini(GridBase* grid, IntegratorParameters Par,
-               ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
-      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
-            grid, Par, Aset, Sm, M){};
-
-  std::string integrator_name(){return "ImplicitCampostrini";}
-
-  void step(Field& U, int level, int _first, int _last) {
-    // level  : current level
-    // fl     : final level
-    // eps    : current step size
-
-    int fl = this->as.size() - 1;
-//    assert(Params.lambda.size()>level);
-//    RealD lambda= Params.lambda[level];
-    assert(level<3);
-    RealD lambda= this->Params.lambda0;
-    if (level>0) lambda= this->Params.lambda1;
-    if (level>1) lambda= this->Params.lambda2;
-    std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl;
-    
-    RealD sigma=pow(2.0,1./3.);
-
-  if(level<fl){
-//Still Omelyan. Needs to change step() to accept variable stepsize
-    RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
-    for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
-
-    // Nesting:  2xupdate_U of size eps/2
-    // Next level is eps/2/multiplier
-
-    int multiplier = this->as[level].multiplier;
-    for (int e = 0; e < multiplier; ++e) {  // steps per step
-
-      int first_step = _first && (e == 0);
-      int last_step = _last && (e == multiplier - 1);
-
-      if (first_step) {  // initial half step
-        this->update_P(U, level, lambda * eps);
-      }
-
-        this->step(U, level + 1, first_step, 0);
-
-      this->update_P(U, level, (1.0 - 2.0 * lambda) * eps);
-
-        this->step(U, level + 1, 0, last_step);
-
-      int mm = (last_step) ? 1 : 2;
-      this->update_P(U, level, lambda * eps * mm);
-    }
-  } 
-  else 
-  { // last level
-    RealD dt = this->Params.trajL/this->Params.MDsteps * 2.0;
-    for (int l = 0; l <= level; ++l) dt /= 2.0 * this->as[l].multiplier;
-
-    RealD epsilon = dt/(2.0 - sigma);
-
-    int multiplier = this->as[level].multiplier;
-    for (int e = 0; e < multiplier; ++e) {  // steps per step
-
-      int first_step = _first && (e == 0);
-      int last_step = _last && (e == multiplier - 1);
-      // initial half step
-      if (first_step) {  this->implicit_update_P(U, level, epsilon*0.5); }
-      this->implicit_update_U(U, epsilon,epsilon*0.5);
-      this->implicit_update_P(U, level, (1.0 - sigma) * epsilon *0.5, epsilon*0.5, true);
-      this->implicit_update_U(U, -epsilon*sigma, -epsilon*sigma*0.5);
-      this->implicit_update_P(U, level, (1.0 - sigma) * epsilon *0.5, -epsilon*sigma*0.5, true);
-      this->implicit_update_U(U, epsilon,epsilon*0.5);
-      if (last_step) { this->update_P2(U, level, epsilon*0.5 ); } 
-      else
-      this->implicit_update_P(U, level, epsilon,epsilon*0.5);
-    }
-  }
-
-  }
-};
-
 NAMESPACE_END(Grid);

 #endif  // INTEGRATOR_INCLUDED
@@ -54,361 +54,7 @@ struct LaplacianParams : Serializable {
      precision(precision){};
 };

-#define LEG_LOAD(Dir)						 \
-  SE = st.GetEntry(ptype, Dir, ss);				 \
-  if (SE->_is_local ) {						 \
-    int perm= SE->_permute;					 \
-    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
-  } else {							 \
-    chi = coalescedRead(buf[SE->_offset],lane);			 \
-  }								 \
-  acceleratorSynchronise();

-const std::vector<int> directions4D   ({Xdir,Ydir,Zdir,Tdir,Xdir,Ydir,Zdir,Tdir});
-const std::vector<int> displacements4D({1,1,1,1,-1,-1,-1,-1});
-
-template<class Gimpl,class Field> class CovariantAdjointLaplacianStencil : public SparseMatrixBase<Field>
-{
-public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-//  RealD kappa;
-
-  typedef typename Field::vector_object siteObject;
-
-  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nc> >, Nds>;
-  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
-  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
-  typedef CartesianStencil<siteObject, siteObject, DefaultImplParams> StencilImpl;
-
-  GridBase *grid;
-  StencilImpl Stencil;
-  SimpleCompressor<siteObject> Compressor;
-  DoubledGaugeField Uds;
-
-  CovariantAdjointLaplacianStencil( GridBase *_grid)
-    : grid(_grid),
-      Stencil    (grid,8,Even,directions4D,displacements4D),
-      Uds(grid){}
-
-  CovariantAdjointLaplacianStencil(GaugeField &Umu)
-    :
-      grid(Umu.Grid()),
-      Stencil    (grid,8,Even,directions4D,displacements4D),
-      Uds(grid)
-  { GaugeImport(Umu); }
-
-  void GaugeImport (const GaugeField &Umu)
-  {
-    assert(grid == Umu.Grid());
-    for (int mu = 0; mu < Nd; mu++) {
-      auto U = PeekIndex<LorentzIndex>(Umu, mu);
-      PokeIndex<LorentzIndex>(Uds, U, mu );
-      U = adj(Cshift(U, mu, -1));
-      PokeIndex<LorentzIndex>(Uds, U, mu + 4);
-    }
-  };
-  
-  virtual GridBase *Grid(void) { return grid; };
-//broken
-#if 0
-  virtual void  MDeriv(const Field &_left, Field &_right,Field &_der, int mu)
-  {
-    ///////////////////////////////////////////////
-    // Halo exchange for this geometry of stencil
-    ///////////////////////////////////////////////
-    Stencil.HaloExchange(_lef, Compressor);
-
-    ///////////////////////////////////
-    // Arithmetic expressions
-    ///////////////////////////////////
-    autoView( st     , Stencil    , AcceleratorRead);
-    auto buf = st.CommBuf();
-
-    autoView( in     , _left    , AcceleratorRead);
-    autoView( right    , _right   , AcceleratorRead);
-    autoView( der    , _der   , AcceleratorWrite);
-    autoView( U     , Uds    , AcceleratorRead);
-
-    typedef typename Field::vector_object        vobj;
-    typedef decltype(coalescedRead(left[0]))    calcObj;
-    typedef decltype(coalescedRead(U[0](0))) calcLink;
-
-    const int      Nsimd = vobj::Nsimd();
-    const uint64_t NN = grid->oSites();
-
-    accelerator_for( ss, NN, Nsimd, {
-
-	StencilEntry *SE;
-	
-	const int lane=acceleratorSIMTlane(Nsimd);
-
-	calcObj chi;
-	calcObj phi;
-	calcObj res;
-	calcObj Uchi;
-	calcObj Utmp;
-	calcObj Utmp2;
-	calcLink UU;
-	calcLink Udag;
-	int ptype;
-
-	res                 = coalescedRead(def[ss]);
-	phi                 = coalescedRead(right[ss]);
-
-#define LEG_LOAD_MULT_LINK(leg,polarisation)			\
-	UU = coalescedRead(U[ss](polarisation));	\
-	Udag = adj(UU);					\
-	LEG_LOAD(leg);					\
-	mult(&Utmp(), &UU, &chi());			\
-	Utmp2 = adj(Utmp);				\
-	mult(&Utmp(), &UU, &Utmp2());			\
-	Utmp2 = adj(Utmp);				\
-	mult(&Uchi(), &phi(), &Utmp2());			\
-	res = res + Uchi;
-	
-	LEG_LOAD_MULT_LINK(0,Xp);
-	LEG_LOAD_MULT_LINK(1,Yp);
-	LEG_LOAD_MULT_LINK(2,Zp);
-	LEG_LOAD_MULT_LINK(3,Tp);
-
-	coalescedWrite(der[ss], res,lane);
-    });
-
-  };
-#endif
-
-  virtual void  Morig(const Field &_in, Field &_out)
-  {
-    ///////////////////////////////////////////////
-    // Halo exchange for this geometry of stencil
-    ///////////////////////////////////////////////
-    Stencil.HaloExchange(_in, Compressor);
-
-    ///////////////////////////////////
-    // Arithmetic expressions
-    ///////////////////////////////////
-//    auto st = Stencil.View(AcceleratorRead);
-    autoView( st     , Stencil    , AcceleratorRead);
-    auto buf = st.CommBuf();
-
-    autoView( in     , _in    , AcceleratorRead);
-    autoView( out    , _out   , AcceleratorWrite);
-    autoView( U     , Uds    , AcceleratorRead);
-
-    typedef typename Field::vector_object        vobj;
-    typedef decltype(coalescedRead(in[0]))    calcObj;
-    typedef decltype(coalescedRead(U[0](0))) calcLink;
-
-    const int      Nsimd = vobj::Nsimd();
-    const uint64_t NN = grid->oSites();
-
-    accelerator_for( ss, NN, Nsimd, {
-
-	StencilEntry *SE;
-	
-	const int lane=acceleratorSIMTlane(Nsimd);
-
-	calcObj chi;
-	calcObj res;
-	calcObj Uchi;
-	calcObj Utmp;
-	calcObj Utmp2;
-	calcLink UU;
-	calcLink Udag;
-	int ptype;
-
-	res                 = coalescedRead(in[ss])*(-8.0);
-
-#define LEG_LOAD_MULT(leg,polarisation)			\
-	UU = coalescedRead(U[ss](polarisation));	\
-	Udag = adj(UU);					\
-	LEG_LOAD(leg);					\
-	mult(&Utmp(), &UU, &chi());			\
-	Utmp2 = adj(Utmp);				\
-	mult(&Utmp(), &UU, &Utmp2());			\
-	Uchi = adj(Utmp);				\
-	res = res + Uchi;
-	
-	LEG_LOAD_MULT(0,Xp);
-	LEG_LOAD_MULT(1,Yp);
-	LEG_LOAD_MULT(2,Zp);
-	LEG_LOAD_MULT(3,Tp);
-	LEG_LOAD_MULT(4,Xm);
-	LEG_LOAD_MULT(5,Ym);
-	LEG_LOAD_MULT(6,Zm);
-	LEG_LOAD_MULT(7,Tm);
-
-	coalescedWrite(out[ss], res,lane);
-    });
-
-  };
-  virtual void  Mnew (const Field &_in, Field &_out)
-  {
-    ///////////////////////////////////////////////
-    // Halo exchange for this geometry of stencil
-    ///////////////////////////////////////////////
-//    Stencil.HaloExchange(_in, Compressor);
-      std::vector<std::vector<CommsRequest_t> > requests;
-      Stencil.Prepare();
-  {
-    GRID_TRACE("Laplace Gather");
-    Stencil.HaloGather(_in,Compressor);
-  }
-
-  tracePush("Laplace Communication");
-  Stencil.CommunicateBegin(requests);
-  {
-    GRID_TRACE("MergeSHM");
-    Stencil.CommsMergeSHM(Compressor);
-  }
-    
-
-    ///////////////////////////////////
-    // Arithmetic expressions
-    ///////////////////////////////////
-//    auto st = Stencil.View(AcceleratorRead);
-    autoView( st     , Stencil    , AcceleratorRead);
-    auto buf = st.CommBuf();
-
-    autoView( in     , _in    , AcceleratorRead);
-    autoView( out    , _out   , AcceleratorWrite);
-    autoView( U     , Uds    , AcceleratorRead);
-
-    typedef typename Field::vector_object        vobj;
-    typedef decltype(coalescedRead(in[0]))    calcObj;
-    typedef decltype(coalescedRead(U[0](0))) calcLink;
-
-    const int      Nsimd = vobj::Nsimd();
-    const uint64_t NN = grid->oSites();
-
-    accelerator_for( ss, NN, Nsimd, {
-
-	StencilEntry *SE;
-	
-	const int lane=acceleratorSIMTlane(Nsimd);
-
-	calcObj chi;
-	calcObj res;
-	calcObj Uchi;
-	calcObj Utmp;
-	calcObj Utmp2;
-	calcLink UU;
-	calcLink Udag;
-	int ptype;
-
-	res                 = coalescedRead(in[ss])*(-8.0);
-
-
-        SE = st.GetEntry(ptype, 0, ss);				 
-        if (SE->_is_local ) {
-	LEG_LOAD_MULT(0,Xp);
-	}
-        SE = st.GetEntry(ptype, 1, ss);				 
-        if (SE->_is_local ) {
-	LEG_LOAD_MULT(1,Yp);
-	}
-        SE = st.GetEntry(ptype, 2, ss);				 
-        if (SE->_is_local ) {
-	LEG_LOAD_MULT(2,Zp);
-	}
-        SE = st.GetEntry(ptype, 3, ss);				 
-        if (SE->_is_local ) {
-	LEG_LOAD_MULT(3,Tp);
-	}
-        SE = st.GetEntry(ptype, 4, ss);				 
-        if (SE->_is_local ) {
-	LEG_LOAD_MULT(4,Xm);
-	}
-        SE = st.GetEntry(ptype, 5, ss);				 
-        if (SE->_is_local ) {
-	LEG_LOAD_MULT(5,Ym);
-	}
-        SE = st.GetEntry(ptype, 6, ss);				 
-        if (SE->_is_local ) {
-	LEG_LOAD_MULT(6,Zm);
-	}
-        SE = st.GetEntry(ptype, 7, ss);				 
-        if (SE->_is_local ) {
-	LEG_LOAD_MULT(7,Tm);
-	}
-
-	coalescedWrite(out[ss], res,lane);
-    });
-
-    Stencil.CommunicateComplete(requests);
-  tracePop("Communication");
-
-  {
-    GRID_TRACE("Merge");
-    Stencil.CommsMerge(Compressor);
-  }
-
-
-    accelerator_for( ss, NN, Nsimd, {
-
-	StencilEntry *SE;
-	
-	const int lane=acceleratorSIMTlane(Nsimd);
-
-	calcObj chi;
-	calcObj res;
-	calcObj Uchi;
-	calcObj Utmp;
-	calcObj Utmp2;
-	calcLink UU;
-	calcLink Udag;
-	int ptype;
-
-//	res                 = coalescedRead(in[ss])*(-8.0);
-	res                 = coalescedRead(out[ss]);
-
-        SE = st.GetEntry(ptype, 0, ss);				 
-        if ((SE->_is_local )==0){
-	LEG_LOAD_MULT(0,Xp);
-	}
-        SE = st.GetEntry(ptype, 1, ss);				 
-        if ((SE->_is_local )==0){
-	LEG_LOAD_MULT(1,Yp);
-	}
-        SE = st.GetEntry(ptype, 2, ss);				 
-        if ((SE->_is_local )==0){
-	LEG_LOAD_MULT(2,Zp);
-	}
-        SE = st.GetEntry(ptype, 3, ss);
-        if ((SE->_is_local )==0){
-	LEG_LOAD_MULT(3,Tp);
-	}
-        SE = st.GetEntry(ptype, 4, ss);
-        if ((SE->_is_local )==0){
-	LEG_LOAD_MULT(4,Xm);
-	}
-        SE = st.GetEntry(ptype, 5, ss);
-        if ((SE->_is_local )==0){
-	LEG_LOAD_MULT(5,Ym);
-	}
-        SE = st.GetEntry(ptype, 6, ss);
-        if ((SE->_is_local )==0){
-	LEG_LOAD_MULT(6,Zm);
-	}
-        SE = st.GetEntry(ptype, 7, ss);
-        if ((SE->_is_local )==0){
-	LEG_LOAD_MULT(7,Tm);
-	}
-
-	coalescedWrite(out[ss], res,lane);
-    });
-  };
-
-  virtual void  M(const Field &in, Field &out) {Mnew(in,out);};
-  virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
-  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid
-};
-
-#undef LEG_LOAD_MULT
-#undef LEG_LOAD_MULT_LINK
-#undef LEG_LOAD

 ////////////////////////////////////////////////////////////
 // Laplacian operator L on adjoint fields
@@ -430,40 +76,29 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
  LaplacianParams param;
  MultiShiftFunction PowerHalf;    
  MultiShiftFunction PowerInvHalf;    
-//template<class Gimpl,class Field> class CovariantAdjointLaplacianStencil : public SparseMatrixBase<Field>
-  CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField> LapStencil;

 public:
  INHERIT_GIMPL_TYPES(Impl);

-  LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0, bool if_remez=true)
-    : U(Nd, grid), Solver(S), param(p), kappa(k)
-	,LapStencil(grid){
+  LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0)
+    : U(Nd, grid), Solver(S), param(p), kappa(k){
    AlgRemez remez(param.lo,param.hi,param.precision);
    std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-    if(if_remez){
    remez.generateApprox(param.degree,1,2);
    PowerHalf.Init(remez,param.tolerance,false);
    PowerInvHalf.Init(remez,param.tolerance,true);
-    }
-    this->triv=0;
        

  };
-  LaplacianAdjointField(){this->triv=0; printf("triv=%d\n",this->Trivial());}
+
  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}

  void ImportGauge(const GaugeField& _U) {
-    RealD total=0.;
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(_U, mu);
-      total += norm2(U[mu]);
    }
-    LapStencil.GaugeImport (_U);
-
-    std::cout << GridLogDebug <<"ImportGauge:norm2(U _U) = "<<total<<std::endl;
  }

  void M(const GaugeField& in, GaugeField& out) {
@@ -471,12 +106,10 @@ public:
    // test
    //GaugeField herm = in + adj(in);
    //std::cout << "AHermiticity: " << norm2(herm) << std::endl;
-//    std::cout << GridLogDebug <<"M:Kappa = "<<kappa<<std::endl;

-    GaugeLinkField sum(in.Grid());
-#if 0
    GaugeLinkField tmp(in.Grid());
    GaugeLinkField tmp2(in.Grid());
+    GaugeLinkField sum(in.Grid());

    for (int nu = 0; nu < Nd; nu++) {
      sum = Zero();
@@ -490,22 +123,10 @@ public:
      out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum;
      PokeIndex<LorentzIndex>(out, out_nu, nu);
    }
-#else
-    for (int nu = 0; nu < Nd; nu++) {
-      GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
-      GaugeLinkField out_nu(out.Grid());
-      LapStencil.M(in_nu,sum);
-      out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum;
-      PokeIndex<LorentzIndex>(out, out_nu, nu);
-    }
-#endif
-//    std::cout << GridLogDebug <<"M:norm2(out) = "<<norm2(out)<<std::endl;
  }

-
  void MDeriv(const GaugeField& in, GaugeField& der) {
    // in is anti-hermitian
-//    std::cout << GridLogDebug <<"MDeriv:Kappa = "<<kappa<<std::endl;
    RealD factor = -kappa / (double(4 * Nd));
    
    for (int mu = 0; mu < Nd; mu++){
@@ -519,7 +140,6 @@ public:
      // adjoint in the last multiplication
      PokeIndex<LorentzIndex>(der,  -2.0 * factor * der_mu, mu);
    } 
-    std::cout << GridLogDebug <<"MDeriv: Kappa= "<< kappa << " norm2(der) = "<<norm2(der)<<std::endl;
  }

  // separating this temporarily
@@ -539,22 +159,11 @@ public:
      }
      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
    }
-    std::cout << GridLogDebug <<"MDeriv: Kappa= "<< kappa << " norm2(der) = "<<norm2(der)<<std::endl;
  }

  void Minv(const GaugeField& in, GaugeField& inverted){
    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
    Solver(HermOp, in, inverted);
-    std::cout << GridLogDebug <<"Minv:norm2(inverted) = "<<norm2(inverted)<<std::endl;
-  }
-
-
-  void MinvDeriv(const GaugeField& in, GaugeField& der) {
-    GaugeField X(in.Grid());
-    Minv(in,X);
-    MDeriv(X,der);
-    der *=-1.0;
-    std::cout << GridLogDebug <<"MinvDeriv:norm2(der) = "<<norm2(der)<<std::endl;
  }

  void MSquareRoot(GaugeField& P){
@@ -563,7 +172,6 @@ public:
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerHalf);
    msCG(HermOp,P,Gp);
    P = Gp; 
-    std::cout << GridLogDebug <<"MSquareRoot:norm2(P) = "<<norm2(P)<<std::endl;
  }

  void MInvSquareRoot(GaugeField& P){
@@ -572,7 +180,6 @@ public:
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerInvHalf);
    msCG(HermOp,P,Gp);
    P = Gp; 
-    std::cout << GridLogDebug <<"MInvSquareRoot:norm2(P) = "<<norm2(P)<<std::endl;
  }


@@ -1,403 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/scalar/CovariantLaplacianRat.h
-
-Copyright (C) 2021
-
-Author: Chulwoo Jung <chulwoo@bnl.gov>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#pragma once 
-#define MIXED_CG
-//enable/disable push_back
-#undef USE_CHRONO 
-
-//#include <roctracer/roctx.h>
-
-NAMESPACE_BEGIN(Grid);
-
-struct LaplacianRatParams {
-
-  RealD offset;
-  int order;
-  std::vector<RealD> a0;
-  std::vector<RealD> a1;
-  std::vector<RealD> b0;
-  std::vector<RealD> b1;
-  RealD b2; //for debugging
-  int   MaxIter;
-  RealD tolerance;
-  int   precision;
-  
-  // constructor 
-  LaplacianRatParams(int ord = 1,
-                  int maxit     = 1000,
-                  RealD tol     = 1.0e-8, 
-                  int precision = 64)
-    : offset(1.), order(ord),b2(1.),
-      MaxIter(maxit),
-      tolerance(tol),
-      precision(precision){ 
-      a0.resize(ord,0.);
-      a1.resize(ord,0.);
-      b0.resize(ord,0.);
-      b1.resize(ord,0.);
-      };
-};
-
-
-
-////////////////////////////////////////////////////////////
-// Laplacian operator L on adjoint fields
-//
-// phi: adjoint field
-// L: D_mu^dag D_mu
-//
-// L phi(x) = Sum_mu [ U_mu(x)phi(x+mu)U_mu(x)^dag + 
-//                     U_mu(x-mu)^dag phi(x-mu)U_mu(x-mu)
-//                     -2phi(x)]
-//
-// Operator designed to be encapsulated by
-// an HermitianLinearOperator<.. , ..>
-////////////////////////////////////////////////////////////
-
-template <class Impl, class ImplF>
-class LaplacianAdjointRat: public Metric<typename Impl::Field> {
-  OperatorFunction<typename Impl::Field> &Solver;
-  LaplacianRatParams Gparam;
-  LaplacianRatParams Mparam;
-  GridBase *grid;
-  GridBase *grid_f;
-  CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField> LapStencil;
-  CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField> LapStencilF;
-public:
-  INHERIT_GIMPL_TYPES(Impl);
-//   typedef typename GImpl::LinkField GaugeLinkField; \
-//  typedef typename GImpl::Field GaugeField;         
-  typedef typename ImplF::Field GaugeFieldF;
-  typedef typename ImplF::LinkField GaugeLinkFieldF; \
-  GaugeField Usav;
-  GaugeFieldF UsavF;
-  std::vector< std::vector<GaugeLinkField> > prev_solnsM;
-  std::vector< std::vector<GaugeLinkField> > prev_solnsMinv;
-  std::vector< std::vector<GaugeLinkField> > prev_solnsMDeriv;
-  std::vector< std::vector<GaugeLinkField> > prev_solnsMinvDeriv;
-
-	  LaplacianAdjointRat(GridBase* _grid, GridBase* _grid_f, OperatorFunction<GaugeField>& S, LaplacianRatParams& gpar, LaplacianRatParams& mpar)
-    : grid(_grid),grid_f(_grid_f), LapStencil(_grid), LapStencilF(_grid_f), U(Nd, _grid), Solver(S), Gparam(gpar), Mparam(mpar),Usav(_grid), UsavF(_grid_f),
-      prev_solnsM(4),prev_solnsMinv(4),prev_solnsMDeriv(4),prev_solnsMinvDeriv(4) {
-//    std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-    this->triv=0;
-        
-
-  };
-  LaplacianAdjointRat(){this->triv=0; printf("triv=%d\n",this->Trivial());}
-  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
-  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
-  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
-
-  void ImportGauge(const GaugeField& _U) {
-    RealD total=0.;
-    for (int mu = 0; mu < Nd; mu++) {
-      U[mu] = PeekIndex<LorentzIndex>(_U, mu);
-      total += norm2(U[mu]);
-    }
-    Usav = _U;
-    precisionChange(UsavF,Usav);
-    std::cout <<GridLogDebug << "ImportGauge:norm2(_U) = "<<" "<<total<<std::endl;
-  }
-
-  void MDerivLink(const GaugeLinkField& left, const GaugeLinkField& right,
-              GaugeField& der) {
-    std::cout<<GridLogMessage << "MDerivLink start "<< std::endl;
-    RealD factor = -1. / (double(4 * Nd));
-    for (int mu = 0; mu < Nd; mu++) {
-      GaugeLinkField der_mu(der.Grid());
-      der_mu = Zero();
-//      for (int nu = 0; nu < Nd; nu++) {
-//        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
-//        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
-        der_mu += U[mu] * Cshift(left, mu, 1) * adj(U[mu]) * right;
-        der_mu += U[mu] * Cshift(right, mu, 1) * adj(U[mu]) * left;
-//      }
-      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
-    }
-//    std::cout << GridLogDebug <<"MDerivLink:  norm2(der) = "<<norm2(der)<<std::endl;
-    std::cout<<GridLogMessage << "MDerivLink end "<< std::endl;
-  }
-
-  void MDerivLink(const GaugeLinkField& left, const GaugeLinkField& right,
-              std::vector<GaugeLinkField> & der) {
-//    std::cout<<GridLogMessage << "MDerivLink "<< std::endl;
-    RealD factor = -1. / (double(4 * Nd));
-
-    for (int mu = 0; mu < Nd; mu++) {
-      GaugeLinkField der_mu(left.Grid());
-      der_mu = Zero();
-        der_mu += U[mu] * Cshift(left, mu, 1) * adj(U[mu]) * right;
-        der_mu += U[mu] * Cshift(right, mu, 1) * adj(U[mu]) * left;
-//      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
-      der[mu] = -factor*der_mu;
-//      std::cout << GridLogDebug <<"MDerivLink:  norm2(der) = "<<norm2(der[mu])<<std::endl;
-        
-    }
-//    std::cout<<GridLogMessage << "MDerivLink end "<< std::endl;
-  }
-
-  void MDerivInt(LaplacianRatParams &par, const GaugeField& left, const GaugeField& right,
-              GaugeField& der ,  std::vector< std::vector<GaugeLinkField> >& prev_solns ) {
-
-// get rid of this please
-    std::cout<<GridLogMessage << "LaplaceStart " <<std::endl;
-    RealD fac =  - 1. / (double(4 * Nd)) ;
-    RealD coef=0.5;
-    LapStencil.GaugeImport(Usav);
-    LapStencilF.GaugeImport(UsavF);
-
-
-    for (int nu=0;nu<Nd;nu++){
-        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
-        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
-        GaugeLinkField LMinvMom(left.Grid());
-    
-        GaugeLinkField GMom(left.Grid());
-        GaugeLinkField LMinvGMom(left.Grid());
-    
-        GaugeLinkField AGMom(left.Grid());
-        GaugeLinkField MinvAGMom(left.Grid());
-        GaugeLinkField LMinvAGMom(left.Grid());
-    
-        GaugeLinkField AMinvMom(left.Grid());
-        GaugeLinkField LMinvAMom(left.Grid());
-        GaugeLinkField temp(left.Grid());
-        GaugeLinkField temp2(left.Grid());
-    
-        std::vector<GaugeLinkField> MinvMom(par.order,left.Grid());
-    
-        GaugeLinkField MinvGMom(left.Grid());
-        GaugeLinkField Gtemp(left.Grid());
-        GaugeLinkField Gtemp2(left.Grid());
-    
-    
-        ConjugateGradient<GaugeLinkField> CG(par.tolerance,10000,false);
-    //    ConjugateGradient<GaugeFieldF> CG_f(par.tolerance,10000,false);
-        LaplacianParams LapPar(0.0001, 1.0, 10000, 1e-8, 12, 64);
-    
-        ChronoForecast< QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,GaugeLinkField>,GaugeLinkField> , GaugeLinkField> Forecast;
-    
-        GMom = par.offset * right_nu;
-    
-        for(int i =0;i<par.order;i++){
-        QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
-#if USE_CHRONO
-        MinvMom[i] = Forecast(QuadOp, right_nu, prev_solns[nu]);
-#endif
-#ifndef MIXED_CG
-        CG(QuadOp,right_nu,MinvMom[i]);
-#else
-        QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
-    //    QuadLinearOperator<LaplacianAdjointField<ImplF>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],par.b1[i],par.b2);
-        MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp);
-        MixedCG.InnerTolerance=par.tolerance;
-        MixedCG(right_nu,MinvMom[i]);
-    #endif
-    #if USE_CHRONO
-        prev_solns[nu].push_back(MinvMom[i]);
-    #endif
-        
-        GMom += par.a0[i]*MinvMom[i]; 
-        LapStencil.M(MinvMom[i],Gtemp2);
-        GMom += par.a1[i]*fac*Gtemp2; 
-        }
-        for(int i =0;i<par.order;i++){
-        QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
-    
-        MinvGMom = Forecast(QuadOp, GMom, prev_solns[nu]);
-    #ifndef MIXED_CG
-        CG(QuadOp,GMom,MinvGMom);
-        LapStencil.M(MinvGMom, Gtemp2); LMinvGMom=fac*Gtemp2;
-        CG(QuadOp,right_nu,MinvMom[i]);
-    #else
-        QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
-    //    QuadLinearOperator<LaplacianAdjointField<ImplF>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],par.b1[i],par.b2);
-        MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp);
-        MixedCG.InnerTolerance=par.tolerance;
-        MixedCG(GMom,MinvGMom);
-        LapStencil.M(MinvGMom, Gtemp2); LMinvGMom=fac*Gtemp2;
-    //    Laplacian.M(MinvGMom, LMinvGMom);
-        MixedCG(right_nu,MinvMom[i]);
-    #endif
-#if USE_CHRONO
-        prev_solns[nu].push_back(MinvGMom);
-#endif
-    
-        LapStencil.M(MinvMom[i], Gtemp2); LMinvMom=fac*Gtemp2;
-        AMinvMom = par.a1[i]*LMinvMom;
-        AMinvMom += par.a0[i]*MinvMom[i];
-    
-        LapStencil.M(AMinvMom, Gtemp2); LMinvAMom=fac*Gtemp2;
-        LapStencil.M(MinvGMom, Gtemp2); temp=fac*Gtemp2;
-        MinvAGMom = par.a1[i]*temp;
-        MinvAGMom += par.a0[i]*MinvGMom;
-        LapStencil.M(MinvAGMom, Gtemp2); LMinvAGMom=fac*Gtemp2;
-    
-    
-        GaugeField tempDer(left.Grid());
-        std::vector<GaugeLinkField> DerLink(Nd,left.Grid());
-        std::vector<GaugeLinkField> tempDerLink(Nd,left.Grid());
-
-        std::cout<<GridLogMessage << "force contraction "<< i <<std::endl;
-    //    roctxRangePushA("RMHMC force contraction");
- #if 0
-        MDerivLink(GMom,MinvMom[i],tempDer); der += coef*2*par.a1[i]*tempDer;
-        MDerivLink(left_nu,MinvGMom,tempDer); der += coef*2*par.a1[i]*tempDer;
-        MDerivLink(LMinvAGMom,MinvMom[i],tempDer); der += coef*-2.*par.b2*tempDer;
-        MDerivLink(LMinvAMom,MinvGMom,tempDer); der += coef*-2.*par.b2*tempDer;
-        MDerivLink(MinvAGMom,LMinvMom,tempDer); der += coef*-2.*par.b2*tempDer;
-        MDerivLink(AMinvMom,LMinvGMom,tempDer); der += coef*-2.*par.b2*tempDer;
-        MDerivLink(MinvAGMom,MinvMom[i],tempDer); der += coef*-2.*par.b1[i]*tempDer;
-        MDerivLink(AMinvMom,MinvGMom,tempDer); der += coef*-2.*par.b1[i]*tempDer;
-#else
-	for (int mu=0;mu<Nd;mu++) DerLink[mu]=Zero();
-        MDerivLink(GMom,MinvMom[i],tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*2*par.a1[i]*tempDerLink[mu];
-        MDerivLink(left_nu,MinvGMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*2*par.a1[i]*tempDerLink[mu];
-        MDerivLink(LMinvAGMom,MinvMom[i],tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b2*tempDerLink[mu];
-        MDerivLink(LMinvAMom,MinvGMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b2*tempDerLink[mu];
-        MDerivLink(MinvAGMom,LMinvMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b2*tempDerLink[mu];
-        MDerivLink(AMinvMom,LMinvGMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b2*tempDerLink[mu];
-        MDerivLink(MinvAGMom,MinvMom[i],tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b1[i]*tempDerLink[mu];
-        MDerivLink(AMinvMom,MinvGMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b1[i]*tempDerLink[mu];
-//      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
-        for (int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(tempDer, tempDerLink[mu], mu);
-
-	der += tempDer;
-#endif
-        std::cout<<GridLogMessage << "coef =  force contraction "<< i << "done "<< coef <<std::endl;
-    //    roctxRangePop();
-    
-        }
-    }
-    std::cout<<GridLogMessage << "LaplaceEnd " <<std::endl;
-//  exit(-42);
-  }
-
-  void MDeriv(const GaugeField& in, GaugeField& der) {
-    MDeriv(in,in, der);
-  }
-
-  void MDeriv(const GaugeField& left, const GaugeField& right,
-              GaugeField& der) {
-
-    der=Zero();
-    MDerivInt(Mparam, left, right, der,prev_solnsMDeriv );
-    std::cout <<GridLogDebug << "MDeriv:norm2(der) = "<<norm2(der)<<std::endl;
-  }
-
-  void MinvDeriv(const GaugeField& in, GaugeField& der) {
-    std::vector< std::vector<GaugeLinkField> > prev_solns(4);
-    der=Zero();
-    MDerivInt(Gparam, in, in, der,prev_solnsMinvDeriv);
-    std::cout <<GridLogDebug << "MinvDeriv:norm2(der) = "<<norm2(der)<<std::endl;
-  }
-
-
-  void MSquareRootInt(LaplacianRatParams &par, GaugeField& P, std::vector< std::vector<GaugeLinkField> > & prev_solns ){
-
-    std::cout<<GridLogMessage << "LaplaceStart " <<std::endl;
-    RealD fac = -1. / (double(4 * Nd));
-    LapStencil.GaugeImport(Usav);
-    LapStencilF.GaugeImport(UsavF);
-    for(int nu=0; nu<Nd;nu++){
-        GaugeLinkField P_nu = PeekIndex<LorentzIndex>(P, nu);
-        GaugeLinkField Gp(P.Grid());
-        Gp = par.offset * P_nu;
-        ConjugateGradient<GaugeLinkField> CG(par.tolerance,10000);
-    //    ConjugateGradient<GaugeLinkFieldF> CG_f(1.0e-8,10000);
-    
-        ChronoForecast< QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> , GaugeLinkField> Forecast;
-    
-        GaugeLinkField Gtemp(P.Grid());
-        GaugeLinkField Gtemp2(P.Grid());
-    
-    
-        for(int i =0;i<par.order;i++){
-        QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
-    
-        Gtemp = Forecast(QuadOp, P_nu, prev_solns[nu]);
-    #ifndef MIXED_CG
-        CG(QuadOp,P_nu,Gtemp);
-    #else
-        QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
-    //    QuadLinearOperator<LaplacianAdjointField<ImplF>,GaugeFieldF> QuadOpF(LapStencilF,par.b0[i],par.b1[i],par.b2);
-        MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp);
-        MixedCG.InnerTolerance=par.tolerance;
-        MixedCG(P_nu,Gtemp);
-    #endif
-    #if USE_CHRONO
-        prev_solns[nu].push_back(Gtemp);
-    #endif
-    
-        Gp += par.a0[i]*Gtemp; 
-        LapStencil.M(Gtemp,Gtemp2);
-        Gp += par.a1[i]*fac*Gtemp2; 
-        }
-        PokeIndex<LorentzIndex>(P, Gp, nu);
-    }
-    std::cout<<GridLogMessage << "LaplaceEnd " <<std::endl;
-  }
-
-  void MSquareRoot(GaugeField& P){
-    std::vector< std::vector<GaugeLinkField> > prev_solns(4);
-    MSquareRootInt(Mparam,P,prev_solns);
-    std::cout <<GridLogDebug << "MSquareRoot:norm2(P) = "<<norm2(P)<<std::endl;
-  }
-
-  void MInvSquareRoot(GaugeField& P){
-    std::vector< std::vector<GaugeLinkField> > prev_solns(4);
-    MSquareRootInt(Gparam,P,prev_solns);
-    std::cout <<GridLogDebug << "MInvSquareRoot:norm2(P) = "<<norm2(P)<<std::endl;
-  }
-
-  void M(const GaugeField& in, GaugeField& out) {
-      out = in;
-      std::vector< std::vector<GaugeLinkField> > prev_solns(4);
-      MSquareRootInt(Mparam,out,prev_solns);
-      MSquareRootInt(Mparam,out,prev_solns);
-      std::cout <<GridLogDebug << "M:norm2(out) = "<<norm2(out)<<std::endl;
-  }
-
-  void Minv(const GaugeField& in, GaugeField& inverted){
-      inverted = in;
-      std::vector< std::vector<GaugeLinkField> > prev_solns(4);
-      MSquareRootInt(Gparam,inverted,prev_solns);
-      MSquareRootInt(Gparam,inverted,prev_solns);
-      std::cout <<GridLogDebug << "Minv:norm2(inverted) = "<<norm2(inverted)<<std::endl;
-  }
-
-
-
-private:
-  std::vector<GaugeLinkField> U;
-};
-#undef MIXED_CG
-
-NAMESPACE_END(Grid);
@@ -7,7 +7,6 @@ Source file: ./lib/qcd/hmc/integrators/Integrator.h
 Copyright (C) 2015

 Author: Guido Cossu <guido.cossu@ed.ac.uk>
-Author: Chulwoo Jung <chulwoo@bnl.gov>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -34,12 +33,7 @@ NAMESPACE_BEGIN(Grid);

 template <typename Field> 
 class Metric{
-protected:
-  int triv;
 public:
-  Metric(){this->triv=1;}
-  int Trivial(){ return triv;}
-//printf("Metric::Trivial=%d\n",triv); ;
  virtual void ImportGauge(const Field&)   = 0;
  virtual void M(const Field&, Field&)     = 0;
  virtual void Minv(const Field&, Field&)  = 0;
@@ -47,8 +41,6 @@ public:
  virtual void MInvSquareRoot(Field&) = 0;
  virtual void MDeriv(const Field&, Field&) = 0;
  virtual void MDeriv(const Field&, const Field&, Field&) = 0;
-  virtual void MinvDeriv(const Field&, Field&) = 0;
-//  virtual void MinvDeriv(const Field&, const Field&, Field&) = 0;
 };


@@ -56,36 +48,23 @@ public:
 template <typename Field>
 class TrivialMetric : public Metric<Field>{
 public:
-//  TrivialMetric(){this->triv=1;printf("TrivialMetric::triv=%d\n",this->Trivial());}
  virtual void ImportGauge(const Field&){};
  virtual void M(const Field& in, Field& out){
-//    printf("M:norm=%0.15e\n",norm2(in));
-    std::cout << GridLogIntegrator << " M:norm(in)= " << std::sqrt(norm2(in)) << std::endl;
    out = in;
  }
  virtual void Minv(const Field& in, Field& out){
-    std::cout << GridLogIntegrator << " Minv:norm(in)= " << std::sqrt(norm2(in)) << std::endl;
    out = in;
  }
  virtual void MSquareRoot(Field& P){
-    std::cout << GridLogIntegrator << " MSquareRoot:norm(P)= " << std::sqrt(norm2(P)) << std::endl;
    // do nothing
  }
  virtual void MInvSquareRoot(Field& P){
-    std::cout << GridLogIntegrator << " MInvSquareRoot:norm(P)= " << std::sqrt(norm2(P)) << std::endl;
    // do nothing
  }
  virtual void MDeriv(const Field& in, Field& out){
-    std::cout << GridLogIntegrator << " MDeriv:norm(in)= " << std::sqrt(norm2(in)) << std::endl;
-    out = Zero();
-  }
-  virtual void MinvDeriv(const Field& in, Field& out){
-    std::cout << GridLogIntegrator << " MinvDeriv:norm(in)= " << std::sqrt(norm2(in)) << std::endl;
    out = Zero();
  }
  virtual void MDeriv(const Field& left, const Field& right, Field& out){
-    std::cout << GridLogIntegrator << " MDeriv:norm(left)= " << std::sqrt(norm2(left)) << std::endl;
-    std::cout << GridLogIntegrator << " MDeriv:norm(right)= " << std::sqrt(norm2(right)) << std::endl;
    out = Zero();
  }

@@ -122,15 +101,14 @@ public:
    // Generate gaussian momenta
    Implementation::generate_momenta(Mom, sRNG, pRNG);
    // Modify the distribution with the metric
-//    if(M.Trivial()) return;
    M.MSquareRoot(Mom);

    if (1) {
      // Auxiliary momenta
      // do nothing if trivial, so hide in the metric
      MomentaField AuxMomTemp(Mom.Grid());
-      Implementation::generate_momenta(AuxMom, sRNG,pRNG);
-      Implementation::generate_momenta(AuxField, sRNG,pRNG);
+      Implementation::generate_momenta(AuxMom, sRNG, pRNG);
+      Implementation::generate_momenta(AuxField, sRNG, pRNG);
      // Modify the distribution with the metric
      // Aux^dag M Aux
      M.MInvSquareRoot(AuxMom);  // AuxMom = M^{-1/2} AuxMomTemp
@@ -139,12 +117,11 @@ public:

  // Correct
  RealD MomentaAction(){
-    static RealD Saux=0.,Smom=0.;
    MomentaField inv(Mom.Grid());
    inv = Zero();
    M.Minv(Mom, inv);
-    LatticeComplex Hloc(Mom.Grid()); Hloc = Zero();
-    LatticeComplex Hloc2(Mom.Grid()); Hloc2 = Zero();
+    LatticeComplex Hloc(Mom.Grid());
+    Hloc = Zero();
    for (int mu = 0; mu < Nd; mu++) {
      // This is not very general
      // hide in the metric
@@ -152,15 +129,8 @@ public:
      auto inv_mu = PeekIndex<LorentzIndex>(inv, mu);
      Hloc += trace(Mom_mu * inv_mu);
    }
-    auto Htmp1 = TensorRemove(sum(Hloc));
-    std::cout << GridLogMessage << "S:dSmom = " << Htmp1.real()-Smom << "\n";
-    Smom=Htmp1.real()/HMC_MOMENTUM_DENOMINATOR;
-    

-    
-
-//    if(!M.Trivial()) 
-    {
+    if (1) {
      // Auxiliary Fields
      // hide in the metric
      M.M(AuxMom, inv);
@@ -170,18 +140,13 @@ public:
        auto inv_mu = PeekIndex<LorentzIndex>(inv, mu);
        auto am_mu = PeekIndex<LorentzIndex>(AuxMom, mu);
        auto af_mu = PeekIndex<LorentzIndex>(AuxField, mu);
-        Hloc += trace(am_mu * inv_mu);
-        Hloc2 += trace(af_mu * af_mu);
+        Hloc += trace(am_mu * inv_mu);// p M p
+        Hloc += trace(af_mu * af_mu);
      }
    }
-    auto Htmp2 = TensorRemove(sum(Hloc))-Htmp1;
-    std::cout << GridLogMessage << "S:dSaux = " << Htmp2.real()-Saux << "\n";
-    Saux=Htmp2.real();

-    auto Hsum = TensorRemove(sum(Hloc))/HMC_MOMENTUM_DENOMINATOR;
-    auto Hsum2 = TensorRemove(sum(Hloc2));
-    std::cout << GridLogIntegrator << "MomentaAction: " <<  Hsum.real()+Hsum2.real() << std::endl;
-    return Hsum.real()+Hsum2.real();
+    auto Hsum = TensorRemove(sum(Hloc));
+    return Hsum.real();
  }

  // Correct
@@ -192,17 +157,15 @@ public:
    MomentaField MDer(in.Grid());
    MomentaField X(in.Grid());
    X = Zero();
-    M.MinvDeriv(in, MDer);  // MDer = U * dS/dU
-    der = -1.0* Implementation::projectForce(MDer);  // Ta if gauge fields
-//    std::cout << GridLogIntegrator << " DerivativeU: norm(in)= " << std::sqrt(norm2(in)) << std::endl;
-//    std::cout << GridLogIntegrator << " DerivativeU: norm(der)= " << std::sqrt(norm2(der)) << std::endl;
+    M.Minv(in, X);  // X = G in
+    M.MDeriv(X, MDer);  // MDer = U * dS/dU
+    der = Implementation::projectForce(MDer);  // Ta if gauge fields
    
  }

  void AuxiliaryFieldsDerivative(MomentaField& der){
    der = Zero();
-//    if(!M.Trivial()) 
-    {
+    if (1){
      // Auxiliary fields
      MomentaField der_temp(der.Grid());
      MomentaField X(der.Grid());
@@ -210,7 +173,6 @@ public:
      //M.M(AuxMom, X); // X = M Aux
      // Two derivative terms
      // the Mderiv need separation of left and right terms
-    std::cout << GridLogIntegrator << " AuxiliaryFieldsDerivative:norm(AuxMom)= " << std::sqrt(norm2(AuxMom)) << std::endl;
      M.MDeriv(AuxMom, der); 


@@ -218,7 +180,6 @@ public:
      //M.MDeriv(X, AuxMom, der_temp); der += der_temp;

      der = -1.0*Implementation::projectForce(der);
-      std::cout << GridLogIntegrator << " AuxiliaryFieldsDerivative:norm(der)= " << std::sqrt(norm2(der)) << std::endl;
    }
  }

@@ -228,28 +189,22 @@ public:
    // is the projection necessary here?
    // no for fields in the algebra
    der = Implementation::projectForce(der); 
-    std::cout << GridLogIntegrator << " DerivativeP:norm(der)= " << std::sqrt(norm2(der)) << std::endl;
  }

  void update_auxiliary_momenta(RealD ep){
-      std::cout << GridLogIntegrator << "AuxMom update_auxiliary_fields: " << std::sqrt(norm2(AuxMom)) << std::endl;
-      std::cout << GridLogIntegrator << "AuxField update_auxiliary_fields: " << std::sqrt(norm2(AuxField)) << std::endl;
-    {
-      AuxMom -= ep * AuxField * HMC_MOMENTUM_DENOMINATOR;
-      std::cout << GridLogIntegrator << "AuxMom update_auxiliary_fields: " << std::sqrt(norm2(AuxMom)) << std::endl;
+    if(1){
+      AuxMom -= ep * AuxField;
    }
  }

  void update_auxiliary_fields(RealD ep){
-//    if(!M.Trivial()) 
-    {
+    if (1) {
      MomentaField tmp(AuxMom.Grid());
      MomentaField tmp2(AuxMom.Grid());
      M.M(AuxMom, tmp);
      // M.M(tmp, tmp2);
      AuxField += ep * tmp;  // M^2 AuxMom
      // factor of 2?
-      std::cout << GridLogIntegrator << "AuxField update_auxiliary_fields: " << std::sqrt(norm2(AuxField)) << std::endl;
    }
  }

@@ -464,8 +464,7 @@ public:
  //U_padded: the gauge link fields padded out using the PaddedCell class
  //Cell: the padded cell class
  //gStencil: the precomputed generalized local stencil for the staple
-  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil)
-  {
+  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
    double t0 = usecond();
    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
@@ -488,9 +487,9 @@ public:
    for(int mu=0;mu<Nd;mu++){
      { //view scope
 	autoView( gStaple_v , gStaple, AcceleratorWrite);
-	auto gStencil_v = gStencil.View();
+	auto gStencil_v = gStencil.View(AcceleratorRead);
 	
-	accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), {
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
 	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
 	    stencil_ss = Zero();
 	    int off = outer_off;
@@ -1200,9 +1199,9 @@ public:

      { //view scope
 	autoView( gStaple_v , gStaple, AcceleratorWrite);
-	auto gStencil_v = gStencil.View();
+	auto gStencil_v = gStencil.View(AcceleratorRead);

-	accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), {
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
 	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
 	    stencil_ss = Zero();
 	    int s=offset;
@@ -1130,6 +1130,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 #endif
 #endif

+// FIXME: a coalesced-read variant of gpermute is still needed
+template<class vobj> void gpermute(vobj & inout,int perm){
+  // Bits 0..3 of perm each request one permute level, applied in ascending order.
+  vobj scratch=inout;
+  for (int level=0; level<4; level++) {
+    if (perm & (0x1<<level)) { permute(inout,scratch,level); scratch=inout; }
+  }
+}

 NAMESPACE_END(Grid);

@@ -32,7 +32,12 @@ NAMESPACE_BEGIN(Grid);
 struct GeneralStencilEntry { 
  uint64_t _offset;            // 4 bytes 
  uint8_t _permute;            // 1 bytes // Horrible alignment properties
+  uint8_t _wrap;               // 1 byte  // set to 1 when (x+shift)%fd != (x+shift)%ld in some dimension
 };
+struct GeneralStencilEntryReordered : public GeneralStencilEntry { // base entry plus one extra 64-bit field
+  uint64_t _input; // NOTE(review): meaning not visible in this hunk - presumably an input-site index; confirm
+};
+
 // Could pack to 8 + 4 + 4 = 128 bit and use 

 class GeneralLocalStencilView {
@@ -43,10 +48,10 @@ class GeneralLocalStencilView {
  int                               _npoints; // Move to template param?
  GeneralStencilEntry*  _entries_p;

-  accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) const { 
+  accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) { 
    return & this->_entries_p[point+this->_npoints*osite]; 
  }
-
+  void ViewClose(void){};
 };
 ////////////////////////////////////////
 // The Stencil Class itself
@@ -61,7 +66,7 @@ protected:
 public: 
  GridBase *Grid(void) const { return _grid; }

-  View_type View(void) const {
+  View_type View(int mode) const {
    View_type accessor(*( (View_type *) this));
    return accessor;
  }
@@ -101,17 +106,23 @@ public:
 	  // Simpler version using icoor calculation
 	  ////////////////////////////////////////////////
 	  SE._permute =0;
+	  SE._wrap=0;
 	  for(int d=0;d<Coor.size();d++){

 	    int fd = grid->_fdimensions[d];
 	    int rd = grid->_rdimensions[d];
+	    int ld = grid->_ldimensions[d];
 	    int ly = grid->_simd_layout[d];

-	    assert((ly==1)||(ly==2));
+	    assert((ly==1)||(ly==2)||(ly==grid->Nsimd()));

 	    int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
 	    int x = Coor[d];                // x in [0... rd-1] as an oSite 

+	    if ( (x + shift)%fd != (x+shift)%ld ){
+	      SE._wrap = 1;
+	    }
+	    
 	    int permute_dim  = grid->PermuteDim(d);
 	    int permute_slice=0;
 	    if(permute_dim){    
@@ -120,7 +120,7 @@ hipStream_t computeStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
-  hipGetDeviceCount(&nDevices);
+  auto discard = hipGetDeviceCount(&nDevices);
  gpu_props = new hipDeviceProp_t[nDevices];

  char * localRankStr = NULL;
@@ -147,7 +147,7 @@ void acceleratorInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
-    hipGetDeviceProperties(&gpu_props[i], i);
+    discard = hipGetDeviceProperties(&gpu_props[i], i);
    hipDeviceProp_t prop; 
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
@@ -184,13 +184,13 @@ void acceleratorInit(void)
  }
  int device = rank;
 #endif
-  hipSetDevice(device);
-  hipStreamCreate(&copyStream);
-  hipStreamCreate(&computeStream);
+  discard = hipSetDevice(device);
+  discard = hipStreamCreate(&copyStream);
+  discard = hipStreamCreate(&computeStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
-    hipDeviceGetPCIBusId(busid, len, device);
+    discard = hipDeviceGetPCIBusId(busid, len, device);
    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
  }
  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
@@ -117,7 +117,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific

-inline void cuda_mem(void)
+inline void acceleratorMem(void)
 {
  size_t free_t,total_t,used_t;
  cudaMemGetInfo(&free_t,&total_t);
@@ -125,6 +125,11 @@ inline void cuda_mem(void)
  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
 }

+inline void cuda_mem(void) // former name of acceleratorMem(); kept as a forwarding wrapper
+{
+  acceleratorMem();
+}
+
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
@@ -137,6 +142,18 @@ inline void cuda_mem(void)
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
  }
+#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
+  { /* wraps the body in a device lambda and launches it through the ProfileLambdaApply kernel */ \
+    int nt=acceleratorThreads();					\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator					\
+      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
+    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
+    ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+  }

 #define accelerator_for6dNB(iter1, num1,				\
                            iter2, num2,				\
@@ -157,6 +174,20 @@ inline void cuda_mem(void)
    Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
  }

+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
+  { /* NOTE(review): looks like a repeat of the accelerator_for2dNB definition earlier in this header - confirm */ \
+    int nt=acceleratorThreads();					\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator					\
+      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
+    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
+    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
+  }
+
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
@@ -168,6 +199,17 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
    Lambda(x,y,z);
  }
 }
+template<typename lambda>  __global__
+void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) // kernel launched by prof_accelerator_for2dNB; calls Lambda(x,y,z) once per in-range thread
+{
+  // Weird permute is to make lane coalesce for large blocks
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; // first loop index (iter1 in the launching macro)
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; // second loop index (iter2 in the launching macro)
+  uint64_t z = threadIdx.x; // lane index: the launching macro puts nsimd in the x thread dimension
+  if ( (x < num1) && (y<num2) && (z<num3) ) { // threads outside the requested ranges do nothing
+    Lambda(x,y,z);
+  }
+}

 template<typename lambda>  __global__
 void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@@ -208,6 +250,7 @@ inline void *acceleratorAllocShared(size_t bytes)
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    assert(0);
  }
  return ptr;
 };
@@ -232,6 +275,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };

+
 inline int  acceleratorIsCommunicable(void *ptr)
 {
  //  int uvm=0;
@@ -267,6 +311,11 @@ NAMESPACE_END(Grid);

 NAMESPACE_BEGIN(Grid);

+inline void acceleratorMem(void) // placeholder: performs no memory query, only prints a notice
+{
+  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
+}
+
 extern cl::sycl::queue *theGridAccelerator;
 extern cl::sycl::queue *theCopyAccelerator;

@@ -345,6 +394,15 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline

+inline void acceleratorMem(void) // print used/free/total device memory as reported by hipMemGetInfo
+{
+  size_t free_t=0,total_t=0,used_t=0; // zero-initialised: if the query fails and leaves these unwritten, zeros are printed rather than indeterminate values
+  auto discard = hipMemGetInfo(&free_t,&total_t); // status is captured but not inspected
+  used_t=total_t-free_t;
+  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
+}
+
+
 extern hipStream_t copyStream;
 extern hipStream_t computeStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
@@ -405,7 +463,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)

 #define accelerator_barrier(dummy)				\
  {								\
-    hipStreamSynchronize(computeStream);			\
+    auto tmp=hipStreamSynchronize(computeStream);		\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -421,7 +479,7 @@ inline void *acceleratorAllocShared(size_t bytes)
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
-    printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
+    fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
  }
  return ptr;
 };
@@ -433,24 +491,24 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
-    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
+    fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
  }
  return ptr;
 };

-inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
+inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) {  }
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
-  hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
-inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
+inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };

 #endif

@@ -460,6 +518,9 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
 // FIXME -- the non-blocking nature got broken March 30 2023 by PAB
 #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );  
+#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
+  prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
+  accelerator_barrier(dummy); // launch with num2=1, then barrier; expands to several statements, so brace it when used as an if/else body

 #define accelerator_for( iter, num, nsimd, ... )		\
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\
@@ -473,6 +534,12 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);

 #endif

+inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) // copy built from the asynchronous call followed by a synchronise
+{
+  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); // start the copy of bytes from 'from' to 'to'
+  acceleratorCopySynchronise(); // then run the backend's copy synchronise before returning
+}
+
 //////////////////////////////////////////////
 // CPU Target - No accelerator just thread instead
 //////////////////////////////////////////////
@@ -482,6 +549,15 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #undef GRID_SIMT


+inline void acceleratorMem(void) // placeholder: only prints a "not implemented" notice
+{
+  /*
+    struct rusage rusage;
+    getrusage( RUSAGE_SELF, &rusage );
+    return (size_t)rusage.ru_maxrss;
+  */
+  std::cout <<" system acceleratorMem not implemented"<<std::endl; // the getrusage sketch above is commented out; it returns a value, which this void function could not
+}

 #define accelerator 
 #define accelerator_inline strong_inline
@@ -575,4 +651,17 @@ accelerator_inline void acceleratorFence(void)
  return;
 }

+template<class T> void acceleratorPut(T& dev,T&host) // copy one T (sizeof(T) bytes) from host into dev
+{
+  acceleratorCopyToDevice(&host,&dev,sizeof(T)); // arguments are passed as (source=&host, destination=&dev, byte count)
+}
+template<class T> T acceleratorGet(T& dev) // return by value a host copy of the T referenced by dev
+{
+  T host; // default-constructed staging object, overwritten by the copy below
+  acceleratorCopyFromDevice(&dev,&host,sizeof(T)); // copies sizeof(T) bytes from &dev into &host
+  return host;
+}
+
+
+
 NAMESPACE_END(Grid);
@@ -94,6 +94,13 @@ static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;

 typedef AcceleratorVector<int,MaxDims> Coordinate;

+template<class T,int _ndim>
+inline bool operator==(const AcceleratorVector<T,_ndim> &v,const AcceleratorVector<T,_ndim> &w) // equal when sizes match and every element matches
+{
+  if (v.size()!=w.size()) return false; // different lengths never compare equal
+  for(int i=0;i<v.size();i++) if ( v[i]!=w[i] ) return false; // first mismatching element decides
+  return true;
+}
 template<class T,int _ndim>
 inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector<T,_ndim> &v)
 {
@@ -283,6 +283,7 @@ void GridBanner(void)
    std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
 #endif
    std::cout << std::endl;
+    std::cout << std::setprecision(9);
 }

 void Grid_init(int *argc,char ***argv)
@@ -413,7 +414,7 @@ void Grid_init(int *argc,char ***argv)
  // Logging
  ////////////////////////////////////
  std::vector<std::string> logstreams;
-  std::string defaultLog("Error,Warning,Message,Performance");
+  std::string defaultLog("Error,Warning,Message");
  GridCmdOptionCSL(defaultLog,logstreams);
  GridLogConfigure(logstreams);

@@ -537,6 +538,10 @@ void Grid_init(int *argc,char ***argv)

 void Grid_finalize(void)
 {
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<"******* Grid Finalize                ******"<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
@@ -8,7 +8,7 @@ namespace Grid{
  public:

    template<class coor_t>
-    static accelerator_inline void CoorFromIndex (coor_t& coor,int index,const coor_t &dims){
+    static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=0;d<nd;d++){
@@ -18,28 +18,45 @@ namespace Grid{
    }

    template<class coor_t>
-    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
      for(int d=0;d<nd;d++){
-	index = index+stride*coor[d];
+	index = index+(int64_t)stride*coor[d];
 	stride=stride*dims[d];
      }
    }
+    template<class coor_t>
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){ // int-index overload layered on the int64_t one
+      int64_t index64;
+      IndexFromCoor(coor,index64,dims); // compute the index in 64 bits first
+      assert(index64<2*1024*1024*1024LL); // bound is 2^31; the narrowing cast below is only reached for indices under it
+      index = (int) index64;
+    }

    template<class coor_t>
-    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
      for(int d=nd-1;d>=0;d--){
-	index = index+stride*coor[d];
+	index = index+(int64_t)stride*coor[d];
 	stride=stride*dims[d];
      }
    }
    template<class coor_t>
-    static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){ // int-index overload layered on the int64_t one
+      int64_t index64;
+      IndexFromCoorReversed(coor,index64,dims); // compute the index in 64 bits first
+      if ( index64>=2*1024*1024*1024LL ){ // report the offending inputs before the assert below fires
+	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+      }
+      assert(index64<2*1024*1024*1024LL); // bound is 2^31; the narrowing cast below is only reached for indices under it
+      index = (int) index64;
+    }
+    template<class coor_t>
+    static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=nd-1;d>=0;d--){
@@ -1,637 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: 
-
-Copyright (C) 2015-2016
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Guido Cossu
-Author: David Murphy
-Author: Chulwoo Jung <chulwoo@bnl.gov>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-
-#ifdef GRID_DEFAULT_PRECISION_DOUBLE
-#define MIXED_PRECISION
-#endif
-// second level EOFA
-#undef EOFA_H
-#undef USE_OBC
-#define DO_IMPLICIT
-
-NAMESPACE_BEGIN(Grid);
-
-  /*
-   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
-   *    -- Store the single prec action operator.
-   *    -- Clone the gauge field from the operator function argument.
-   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
-   */
-
-  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
-  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
-  public:
-    typedef typename FermionOperatorD::FermionField FieldD;
-    typedef typename FermionOperatorF::FermionField FieldF;
-
-    using OperatorFunction<FieldD>::operator();
-
-    RealD   Tolerance;
-    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
-    Integer MaxInnerIterations;
-    Integer MaxOuterIterations;
-    GridBase* SinglePrecGrid4; //Grid for single-precision fields
-    GridBase* SinglePrecGrid5; //Grid for single-precision fields
-    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
-
-    FermionOperatorF &FermOpF;
-    FermionOperatorD &FermOpD;;
-    SchurOperatorF &LinOpF;
-    SchurOperatorD &LinOpD;
-
-    Integer TotalInnerIterations; //Number of inner CG iterations
-    Integer TotalOuterIterations; //Number of restarts
-    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
-
-    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
-						    Integer maxinnerit, 
-						    Integer maxouterit, 
-						    GridBase* _sp_grid4, 
-						    GridBase* _sp_grid5, 
-						    FermionOperatorF &_FermOpF,
-						    FermionOperatorD &_FermOpD,
-						    SchurOperatorF   &_LinOpF,
-						    SchurOperatorD   &_LinOpD): 
-      LinOpF(_LinOpF),
-      LinOpD(_LinOpD),
-      FermOpF(_FermOpF),
-      FermOpD(_FermOpD),
-      Tolerance(tol), 
-      InnerTolerance(tol), 
-      MaxInnerIterations(maxinnerit), 
-      MaxOuterIterations(maxouterit), 
-      SinglePrecGrid4(_sp_grid4),
-      SinglePrecGrid5(_sp_grid5),
-      OuterLoopNormMult(100.) 
-    { 
-      /* Debugging instances of objects; references are stored
-      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
-      */
-    };
-
-    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
-
-      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
-
-      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
-      
-      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
-      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
-      // Assumption made in code to extract gauge field
-      // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Must snarf a single precision copy of the gauge field in Linop_d argument
-      ////////////////////////////////////////////////////////////////////////////////////
-      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
-      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
-      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
-      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
-
-      GridBase * GridPtrF = SinglePrecGrid4;
-      GridBase * GridPtrD = FermOpD.Umu.Grid();
-      GaugeFieldF     U_f  (GridPtrF);
-      GaugeLinkFieldF Umu_f(GridPtrF);
-      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
-      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Moving this to a Clone method of fermion operator would allow to duplicate the 
-      // physics parameters and decrease gauge field copies
-      ////////////////////////////////////////////////////////////////////////////////////
-      GaugeLinkFieldD Umu_d(GridPtrD);
-      for(int mu=0;mu<Nd*2;mu++){ 
-	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
-	precisionChange(Umu_f,Umu_d);
-	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
-      }
-      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
-      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Make a mixed precision conjugate gradient
-      ////////////////////////////////////////////////////////////////////////////////////
-      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
-      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
-      MPCG(src,psi);
-    }
-  };
-
-NAMESPACE_END(Grid);
-
-
-int main(int argc, char **argv) {
-  using namespace Grid;
-
-  Grid_init(&argc, &argv);
-  int threads = GridThread::GetThreads();
-  // here make a routine to print all the relevant information on the run
-  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
-
-   // Typedefs to simplify notation
-  typedef WilsonImplR FermionImplPolicy;
-  typedef MobiusFermionD FermionAction;
-  typedef MobiusFermionF FermionActionF;
-  typedef MobiusEOFAFermionD FermionEOFAAction;
-  typedef MobiusEOFAFermionF FermionEOFAActionF;
-  typedef typename FermionAction::FermionField FermionField;
-  typedef typename FermionActionF::FermionField FermionFieldF;
-
-  typedef Grid::XmlReader       Serialiser;
-  
-  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-
-  HMCparameters HMCparams;
-#if 1
-  {
-    XmlReader  HMCrd("HMCparameters.xml");
-    read(HMCrd,"HMCparameters",HMCparams);
-  }
-#else
-  {
-//    HMCparameters HMCparams;
-  //  "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("ColdStart");
-    HMCparams.StartingType     =std::string("CheckpointStart");
-    HMCparams.StartTrajectory  =7;
-    HMCparams.SW  =4;
-    HMCparams.Trajectories     =1000;
-    HMCparams.NoMetropolisUntil=0;
-    HMCparams.MD.name          =std::string("Force Gradient");
-    HMCparams.MD.MDsteps       = 10;
-    HMCparams.MD.trajL         = 1.0;
-  }
-#endif
-
-#ifdef DO_IMPLICIT
-//    typedef GenericHMCRunner<ImplicitLeapFrog> HMCWrapper; 
-  typedef GenericHMCRunner<ImplicitMinimumNorm2> HMCWrapper; 
-  HMCparams.MD.name          =std::string("ImplicitMinimumNorm2");
-#else
-//  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
-//  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
-  HMCparams.MD.name          =std::string("ForceGradient");
-#endif
-
-  std::cout << GridLogMessage<< HMCparams <<std::endl;
-  HMCWrapper TheHMC(HMCparams);
-  TheHMC.ReadCommandLine(argc, argv);
-  { 
-    XmlWriter HMCwr("HMCparameters.xml.out");
-    write(HMCwr,"HMCparameters",TheHMC.Parameters);
-  }
-
-  // Grid from the command line arguments --grid and --mpi
-  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-  
-  CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_lat";
-  CPparams.rng_prefix    = "ckpoint_rng";
-  CPparams.saveInterval  = 1;
-  CPparams.format        = "IEEE64BIG";
-  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
-
-  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5";
-  RNGpar.parallel_seeds = "6 7 8 9 10";
-  TheHMC.Resources.SetRNGSeeds(RNGpar);
-
-  // Construct observables
-  // here there is too much indirection 
-  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
-  TheHMC.Resources.AddObservable<PlaqObs>();
-  //////////////////////////////////////////////
-
-  const int Ls      = 12;
-  Real beta         = 5.983;
-  std::cout << GridLogMessage << " beta  "<< beta << std::endl;
-  Real light_mass   = 0.00049;
-  Real strange_mass = 0.0158;
-  Real charm_mass = 0.191;
-  Real pv_mass    = 1.0;
-  RealD M5  = 1.4;
-  RealD b   = 2.0; 
-  RealD c   = 1.0;
-
-  // Copied from paper
-//  std::vector<Real> hasenbusch({ 0.045 }); // Paper values from F1 incorrect run
-  std::vector<Real> hasenbusch({ 0.0038, 0.0145, 0.045, 0.108 , 0.25, 0.51 }); // Paper values from F1 incorrect run
-  std::vector<Real> hasenbusch2({ 0.4 }); // Paper values from F1 incorrect run
-
-//  RealD eofa_mass=0.05 ;
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////
-  //Bad choices with large dH. Equalising force L2 norm was not wise.
-  ///////////////////////////////////////////////////////////////////////////////////////////////
-  //std::vector<Real> hasenbusch({ 0.03, 0.2, 0.3, 0.5, 0.8 }); 
-
-  auto GridPtr   = TheHMC.Resources.GetCartesian();
-  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
-  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
-  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
-
-  Coordinate latt  = GridDefaultLatt();
-  Coordinate mpi   = GridDefaultMpi();
-  Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
-  Coordinate simdD = GridDefaultSimd(Nd,vComplexD::Nsimd());
-//  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
-  auto UGrid_f    = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
-  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
-  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
-  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
-
-
-#ifndef USE_OBC
-//  IwasakiGaugeActionR GaugeAction(beta);
-  WilsonGaugeActionR GaugeAction(beta);
-#else
-  std::vector<Complex> boundaryG = {1,1,1,0};
-  WilsonGaugeActionR::ImplParams ParamsG(boundaryG);
-  WilsonGaugeActionR GaugeAction(beta,ParamsG);
-#endif
-
-  // temporarily need a gauge field
-  LatticeGaugeField U(GridPtr);
-  LatticeGaugeFieldF UF(UGrid_f);
-
-  // These lines are unecessary if BC are all periodic
-#ifndef USE_OBC
-  std::vector<Complex> boundary = {1,1,1,-1};
-#else
-  std::vector<Complex> boundary = {1,1,1,0};
-#endif
-  FermionAction::ImplParams Params(boundary);
-  FermionActionF::ImplParams ParamsF(boundary);
-  
-  double ActionStoppingCondition     = 1e-8;
-  double DerivativeStoppingCondition = 1e-8;
-  double MaxCGIterations =  100000;
-
-  ////////////////////////////////////
-  // Collect actions
-  ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(HMCparams.SW);
-
-  ////////////////////////////////////
-  // Strange action
-  ////////////////////////////////////
-  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
-  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
-  typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
-  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
-
-  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
-  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
-
-  // DJM: setup for EOFA ratio (Mobius)
-  OneFlavourRationalParams OFRp;
-  OFRp.lo       = 0.99; // How do I know this on F1?
-  OFRp.hi       = 20;
-  OFRp.MaxIter  = 100000;
-  OFRp.tolerance= 1.0e-12;
-  OFRp.degree   = 12;
-  OFRp.precision= 50;
-
-  
-  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, charm_mass, 0.0, -1, M5, b, c);
-  MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, strange_mass, strange_mass, charm_mass, 0.0, -1, M5, b, c);
-  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , charm_mass, strange_mass,      charm_mass, -1.0, 1, M5, b, c);
-  MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, charm_mass, strange_mass,      charm_mass, -1.0, 1, M5, b, c);
-  
-#ifdef EOFA_H
-  MobiusEOFAFermionD Strange2_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , eofa_mass, eofa_mass, charm_mass , 0.0, -1, M5, b, c);
-  MobiusEOFAFermionF Strange2_Op_LF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, eofa_mass, eofa_mass, charm_mass , 0.0, -1, M5, b, c);
-  MobiusEOFAFermionD Strange2_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , charm_mass , eofa_mass,      charm_mass , -1.0, 1, M5, b, c);
-  MobiusEOFAFermionF Strange2_Op_RF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, charm_mass , eofa_mass,      charm_mass , -1.0, 1, M5, b, c);
-#endif
-
-  ConjugateGradient<FermionField>      ActionCG(ActionStoppingCondition,MaxCGIterations);
-  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
-#ifdef MIXED_PRECISION
-  const int MX_inner = 50000;
-
-  // Mixed precision EOFA
-  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
-  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
-  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
-  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
-
-#ifdef EOFA_H
-  // Mixed precision EOFA
-  LinearOperatorEOFAD Strange2_LinOp_L (Strange2_Op_L);
-  LinearOperatorEOFAD Strange2_LinOp_R (Strange2_Op_R);
-  LinearOperatorEOFAF Strange2_LinOp_LF(Strange2_Op_LF);
-  LinearOperatorEOFAF Strange2_LinOp_RF(Strange2_Op_RF);
-#endif
-
-  MxPCG_EOFA ActionCGL(ActionStoppingCondition,
-		       MX_inner,
-		       MaxCGIterations,
-		       UGrid_f,
-		       FrbGridF,
-		       Strange_Op_LF,Strange_Op_L,
-		       Strange_LinOp_LF,Strange_LinOp_L);
-
-#ifdef EOFA_H
-  MxPCG_EOFA ActionCGL2(ActionStoppingCondition,
-		       MX_inner,
-		       MaxCGIterations,
-		       UGrid_f,
-		       FrbGridF,
-		       Strange2_Op_LF,Strange2_Op_L,
-		       Strange2_LinOp_LF,Strange2_LinOp_L);
-#endif
-
-  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
-			   MX_inner,
-			   MaxCGIterations,
-			   UGrid_f,
-			   FrbGridF,
-			   Strange_Op_LF,Strange_Op_L,
-			   Strange_LinOp_LF,Strange_LinOp_L);
-
-#ifdef EOFA_H
-  MxPCG_EOFA DerivativeCGL2(DerivativeStoppingCondition,
-			   MX_inner,
-			   MaxCGIterations,
-			   UGrid_f,
-			   FrbGridF,
-			   Strange2_Op_LF,Strange2_Op_L,
-			   Strange2_LinOp_LF,Strange2_LinOp_L);
-#endif
-  
-  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
-		       MX_inner,
-		       MaxCGIterations,
-		       UGrid_f,
-		       FrbGridF,
-		       Strange_Op_RF,Strange_Op_R,
-		       Strange_LinOp_RF,Strange_LinOp_R);
-  
-#ifdef EOFA_H
-  MxPCG_EOFA ActionCGR2(ActionStoppingCondition,
-		       MX_inner,
-		       MaxCGIterations,
-		       UGrid_f,
-		       FrbGridF,
-		       Strange2_Op_RF,Strange2_Op_R,
-		       Strange2_LinOp_RF,Strange2_LinOp_R);
-#endif
-  
-  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
-			   MX_inner,
-			   MaxCGIterations,
-			   UGrid_f,
-			   FrbGridF,
-			   Strange_Op_RF,Strange_Op_R,
-			   Strange_LinOp_RF,Strange_LinOp_R);
-  
-#ifdef EOFA_H
-  MxPCG_EOFA DerivativeCGR2(DerivativeStoppingCondition,
-			   MX_inner,
-			   MaxCGIterations,
-			   UGrid_f,
-			   FrbGridF,
-			   Strange2_Op_RF,Strange2_Op_R,
-			   Strange2_LinOp_RF,Strange2_LinOp_R);
-#endif
-  
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 ActionCG, 
-	 ActionCGL, ActionCGR,
-	 DerivativeCGL, DerivativeCGR,
-	 OFRp, true);
-  
-#ifdef EOFA_H
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA2(Strange2_Op_L, Strange2_Op_R, 
-	 ActionCG, 
-	 ActionCGL2, ActionCGR2,
-	 DerivativeCGL2, DerivativeCGR2,
-	 OFRp, true);
-#endif
-
-  Level1.push_back(&EOFA);
-#ifdef EOFA_H
-  Level1.push_back(&EOFA2);
-#endif
-
-#else
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 ActionCG, 
-	 ActionCG, ActionCG,
-	 ActionCG, ActionCG,
-	 //         DerivativeCG, DerivativeCG,
-	 OFRp, true);
-  Level1.push_back(&EOFA);
-#endif
-
-  ////////////////////////////////////
-  // up down action
-  ////////////////////////////////////
-  std::vector<Real> light_den;
-  std::vector<Real> light_num;
-
-  int n_hasenbusch = hasenbusch.size();
-  light_den.push_back(light_mass);
-  for(int h=0;h<n_hasenbusch;h++){
-    light_den.push_back(hasenbusch[h]);
-    light_num.push_back(hasenbusch[h]);
-  }
-  light_num.push_back(pv_mass);
-
-  int n_hasenbusch2 = hasenbusch2.size();
-  light_den.push_back(charm_mass);
-  for(int h=0;h<n_hasenbusch2;h++){
-    light_den.push_back(hasenbusch2[h]);
-    light_num.push_back(hasenbusch2[h]);
-  }
-  light_num.push_back(pv_mass);
-
-
-  //////////////////////////////////////////////////////////////
-  // Forced to replicate the MxPCG and DenominatorsF etc.. because
-  // there is no convenient way to "Clone" physics params from double op
-  // into single op for any operator pair.
-  // Same issue prevents using MxPCG in the Heatbath step
-  //////////////////////////////////////////////////////////////
-  std::vector<FermionAction *> Numerators;
-  std::vector<FermionAction *> Denominators;
-  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
-  std::vector<MxPCG *> ActionMPCG;
-  std::vector<MxPCG *> MPCG;
-  std::vector<FermionActionF *> DenominatorsF;
-  std::vector<LinearOperatorD *> LinOpD;
-  std::vector<LinearOperatorF *> LinOpF; 
-
-  for(int h=0;h<light_den.size();h++){
-
-    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
-
-    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
-    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
-
-#ifdef MIXED_PRECISION
-    ////////////////////////////////////////////////////////////////////////////
-    // Mixed precision CG for 2f force
-    ////////////////////////////////////////////////////////////////////////////
-    double DerivativeStoppingConditionLoose = 1e-8;
-
-    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*UGrid_f,*GridRBPtrF,light_den[h],M5,b,c, ParamsF));
-    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
-    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
-
-    double conv  = DerivativeStoppingCondition;
-    if (h<3) conv= DerivativeStoppingConditionLoose; // Relax on first two hasenbusch factors
-    MPCG.push_back(new MxPCG(conv,
-			     MX_inner,
-			     MaxCGIterations,
-			     UGrid_f,
-			     FrbGridF,
-			     *DenominatorsF[h],*Denominators[h],
-			     *LinOpF[h], *LinOpD[h]) );
-
-    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
-				   MX_inner,
-				   MaxCGIterations,
-				   UGrid_f,
-				   FrbGridF,
-				   *DenominatorsF[h],*Denominators[h],
-				   *LinOpF[h], *LinOpD[h]) );
-
-    // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
-    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
-#else
-    ////////////////////////////////////////////////////////////////////////////
-    // Standard CG for 2f force
-    ////////////////////////////////////////////////////////////////////////////
-    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],DerivativeCG,ActionCG));
-#endif
-
-  }
-
-  for(int h=0;h<n_hasenbusch+1;h++){
-    Level1.push_back(Quotients[h]);
-  }
-
-  /////////////////////////////////////////////////////////////
-  // Gauge action
-  /////////////////////////////////////////////////////////////
-  Level2.push_back(&GaugeAction);
-  TheHMC.TheAction.push_back(Level1);
-  TheHMC.TheAction.push_back(Level2);
-  std::cout << GridLogMessage << " Action complete "<< std::endl;
-
-  /////////////////////////////////////////////////////////////
-  // HMC parameters are serialisable
-
-  NoSmearing<HMCWrapper::ImplPolicy> S;
-#ifndef DO_IMPLICIT
-  TrivialMetric<HMCWrapper::ImplPolicy::Field> Mtr;
-#else
-    LaplacianRatParams gpar(2),mpar(2);
-    gpar.offset = 1.;
-    gpar.a0[0] = 500.;
-    gpar.a1[0] = 0.;
-    gpar.b0[0] = 0.25;
-    gpar.b1[0] = 1.;
-    gpar.a0[1] = -500.;
-    gpar.a1[1] = 0.;
-    gpar.b0[1] = 0.36;
-    gpar.b1[1] = 1.2;
-    gpar.b2=1.;
-
-    mpar.offset = 1.;
-    mpar.a0[0] =  -0.850891906532;
-    mpar.a1[0] = -1.54707654538;
-    mpar. b0[0] = 2.85557166137;
-    mpar. b1[0] = 5.74194794773;
-    mpar.a0[1] = -13.5120056831218384729709214298;
-    mpar.a1[1] = 1.54707654538396877086370295729;
-    mpar.b0[1] = 19.2921090880640520026645390317;
-    mpar.b1[1] = -3.54194794773029020262811172870;
-    mpar.b2=1.;
-    for(int i=0;i<2;i++){
-       gpar.a1[i] *=16.;
-       gpar.b1[i] *=16.;
-       mpar.a1[i] *=16.;
-       mpar.b1[i] *=16.;
-    }
-    gpar.b2 *= 16.*16.;
-    mpar.b2 *= 16.*16.;
-
-    ConjugateGradient<LatticeGaugeField> CG(1.0e-8,10000);
-    LaplacianParams LapPar(0.0001, 1.0, 10000, 1e-8, 12, 64);
-
-    std::cout << GridLogMessage << "LaplacianRat " << std::endl;
-    gpar.tolerance=HMCparams.MD.RMHMCCGTol;
-    mpar.tolerance=HMCparams.MD.RMHMCCGTol;
-    std::cout << GridLogMessage << "gpar offset= " << gpar.offset <<std::endl;
-    std::cout << GridLogMessage << " a0= " << gpar.a0 <<std::endl;
-    std::cout << GridLogMessage << " a1= " << gpar.a1 <<std::endl;
-    std::cout << GridLogMessage << " b0= " << gpar.b0 <<std::endl;
-    std::cout << GridLogMessage << " b1= " << gpar.b1 <<std::endl;
-    std::cout << GridLogMessage << " b2= " << gpar.b2 <<std::endl ;;
-
-    std::cout << GridLogMessage << "mpar offset= " << mpar.offset <<std::endl;
-    std::cout << GridLogMessage << " a0= " << mpar.a0 <<std::endl;
-    std::cout << GridLogMessage << " a1= " << mpar.a1 <<std::endl;
-    std::cout << GridLogMessage << " b0= " << mpar.b0 <<std::endl;
-    std::cout << GridLogMessage << " b1= " << mpar.b1 <<std::endl;
-    std::cout << GridLogMessage << " b2= " << mpar.b2 <<std::endl;
-//  Assumes PeriodicGimplR or D at the moment
-    auto UGrid = TheHMC.Resources.GetCartesian("gauge");
-//    auto UGrid_f   = GridPtrF;
-//  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
-//    std::cout << GridLogMessage << " UGrid= " << UGrid <<std::endl;
-//    std::cout << GridLogMessage << " UGrid_f= " << UGrid_f <<std::endl;
-
-    LaplacianAdjointRat<HMCWrapper::ImplPolicy, PeriodicGimplF> Mtr(UGrid, UGrid_f ,CG, gpar, mpar);
-#endif
-
-  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
-  TheHMC.Run(S,Mtr);  // no smearing
-
-  Grid_finalize();
-} // main
-
-
-
@@ -1,6 +1,44 @@
- - Slice sum optimisation & A2A - atomic addition
+i) Clean up CoarsenedMatrix, GeneralCoarsenedMatrix, GeneralCoarsenedMatrixMultiRHS
+
+ -- Ideally want a SINGLE implementation that does MultiRHS **AND** works with one RHS.
+
+ -- -- Getting there. One RHS is hard due to vectorisation & hardwired coarse5d layout
+ -- Compromise: Wrap it in a copy in/out for a slice.
+ 
+ -- Bad for Lanczos: need to do a BLOCK Lanczos instead. Longer term.
+
+ -- **** Make the test do ONLY the single RHS. ****
+ -- I/O for the matrix elements required.
+ -- Make the Adef2 build an eigenvector deflater and a block projector
+ -- 
+ 
+ -- Work with Regensburg on tests.
+ -- Plan interface preserving the coarsened matrix interface (??)
+
+-- Move functionality from GeneralCoarsenedMatrix INTO GeneralCoarsenedMatrixMultiRHS -- DONE
+   -- Don't immediately delete original
+   -- Instead make the new one self contained, then delete.
+   -- New DWF inverter test.
+
+  // void PopulateAdag(void)
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop, Aggregation<Fobj,CComplex,nbasis> & Subspace) -- DONE
+  ExchangeCoarseLinks();
+
+iii) Aurora -- Christoph's problem -- DONE
+     Aurora -- Carleton's problem staggered.
+
+iv) Dennis merge and test Aurora -- DONE (save test)
+
+v) Merge Ed Bennett's request -- DONE
+
+vi) Repro CG  -- get down to the level of single node testing via split grid test 
+
+
+=========================
+
+===============
+- - Slice sum optimisation & A2A - atomic addition -- Dennis
 - - Also faster non-atomic reduction
- - Remaining PRs
 - - DDHMC
  - - MixedPrec is the action eval, high precision
  - - MixedPrecCleanup is the force eval, low precision
@@ -17,7 +55,6 @@ DDHMC
 -- Multishift Mixed Precision - DONE
 -- Pole dependent residual  - DONE

-
 =======
 -- comms threads issue??
 -- Part done: Staggered kernel performance on GPU
@@ -365,15 +365,9 @@ public:
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

-#if 1
    typedef DomainWallFermionF Action;
    typedef typename Action::FermionField Fermion;
    typedef LatticeGaugeFieldF Gauge;
-#else
-    typedef GparityDomainWallFermionF Action;
-    typedef typename Action::FermionField Fermion;
-    typedef LatticeGaugeFieldF Gauge;
-#endif
    
    ///////// Source preparation ////////////
    Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
@@ -641,170 +635,6 @@ public:
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    return mflops_best;
  }
-
-  static double Laplace(int L)
-  {
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst= 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
-    Coordinate local({L,L,L,L});
-    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
-    
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),
-								       GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global=NN;
-    uint64_t SHM=NP/NN;
-
-
-    ///////// Welcome message ////////////
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "Benchmark Laplace on "<<L<<"^4 local volume "<<std::endl;
-    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
-    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
-    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
-    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
-    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
-    
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1,2,3,4});
-    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    RealD mass=0.1;
-    RealD c1=9.0/8.0;
-    RealD c2=-1.0/24.0;
-    RealD u0=1.0;
-
-//    typedef ImprovedStaggeredFermionF Action;
-//    typedef typename Action::FermionField Fermion; 
-    typedef LatticeGaugeFieldF Gauge;
-    
-    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
-
-//    typename Action::ImplParams params;
-//    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
-
-//  PeriodicGimplF
-    typedef typename PeriodicGimplF::LinkField GaugeLinkFieldF;
-
-    ///////// Source preparation ////////////
-    GaugeLinkFieldF src   (FGrid); random(RNG4,src);
-//    GaugeLinkFieldF src_e (FrbGrid);
-//    GaugeLinkFieldF src_o (FrbGrid);
-//    GaugeLinkFieldF r_e   (FrbGrid);
-//    GaugeLinkFieldF r_o   (FrbGrid);
-    GaugeLinkFieldF r_eo  (FGrid);
-  
-    {
-
- //     pickCheckerboard(Even,src_e,src);
- //     pickCheckerboard(Odd,src_o,src);
-    
-      const int num_cases = 1;
-      std::string fmt("G/O/C  ");
-      
-      controls Cases [] = {
-	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
-      }; 
-
-      for(int c=0;c<num_cases;c++) {
-        CovariantAdjointLaplacianStencil<PeriodicGimplF,typename PeriodicGimplF::LinkField> LapStencilF(FGrid);
-        QuadLinearOperator<CovariantAdjointLaplacianStencil<PeriodicGimplF,typename PeriodicGimplF::LinkField>,PeriodicGimplF::LinkField> QuadOpF(LapStencilF,c2,c1,1.);
-        LapStencilF.GaugeImport(Umu);
-	
-
-	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
-	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-      
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using Stencil Nc Laplace" <<std::endl;
-	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
-	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	
-	int nwarm = 10;
-	double t0=usecond();
-	FGrid->Barrier();
-	for(int i=0;i<nwarm;i++){
-//	  Ds.DhopEO(src_o,r_e,DaggerNo);
-          QuadOpF.HermOp(src,r_eo);
-	}
-	FGrid->Barrier();
-	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
-
-	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-
-	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-//	  Ds.DhopEO(src_o,r_e,DaggerNo);
-          QuadOpF.HermOp(src,r_eo);
-	  t1=usecond();
-	  t_time[i] = t1-t0;
-	}
-	FGrid->Barrier();
-	
-	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-//	double flops=(1146.0*volume)/2;
-	double flops=(2*2*8*216.0*volume);
-	double mf_hi, mf_lo, mf_err;
-	
-	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-
-	mflops = flops/timestat.mean;
-	mflops_all.push_back(mflops);
-	if ( mflops_best == 0   ) mflops_best = mflops;
-	if ( mflops_worst== 0   ) mflops_worst= mflops;
-	if ( mflops>mflops_best ) mflops_best = mflops;
-	if ( mflops<mflops_worst) mflops_worst= mflops;
-	
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s per rank   "<< mflops/NP<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s per node   "<< mflops/NN<<std::endl;
-	FGrid->Barrier();
-      
-      }
-
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4  Quad Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4  Quad Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage <<fmt << std::endl;
-      std::cout<<GridLogMessage ;
-	FGrid->Barrier();
-
-      for(int i=0;i<mflops_all.size();i++){
-	std::cout<<mflops_all[i]/NN<<" ; " ;
-      }
-      std::cout<<std::endl;
-    }
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    return mflops_best;
-  }
 };


@@ -832,7 +662,6 @@ int main (int argc, char ** argv)
  std::vector<double> wilson;
  std::vector<double> dwf4;
  std::vector<double> staggered;
-  std::vector<double> lap;

  int Ls=1;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -859,20 +688,12 @@ int main (int argc, char ** argv)
    staggered.push_back(result);
  }

-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Laplace QuadOp 4D " <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    double result = Benchmark::Laplace(L_list[l]) ;
-    lap.push_back(result);
-  }
-
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered \t\t Quad Laplace" <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" <<std::endl;
  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<< " \t\t "<< lap[l]<< std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

@@ -1,34 +0,0 @@
-# =============================================================================
-#  https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_14.html
-# =============================================================================
-#
-# SYNOPSIS
-#
-#   AX_CXX_COMPILE_STDCXX_14([ext|noext], [mandatory|optional])
-#
-# DESCRIPTION
-#
-#   Check for baseline language coverage in the compiler for the C++14
-#   standard; if necessary, add switches to CXX and CXXCPP to enable
-#   support.
-#
-#   This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX
-#   macro with the version set to C++14.  The two optional arguments are
-#   forwarded literally as the second and third argument respectively.
-#   Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for
-#   more information.  If you want to use this macro, you also need to
-#   download the ax_cxx_compile_stdcxx.m4 file.
-#
-# LICENSE
-#
-#   Copyright (c) 2015 Moritz Klammler <moritz@klammler.eu>
-#
-#   Copying and distribution of this file, with or without modification, are
-#   permitted in any medium without royalty provided the copyright notice
-#   and this notice are preserved. This file is offered as-is, without any
-#   warranty.
-
-#serial 5
-
-AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX])
-AC_DEFUN([AX_CXX_COMPILE_STDCXX_14], [AX_CXX_COMPILE_STDCXX([14], [$1], [$2])])
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Usage: pass "install" to clone spack and build the packages; otherwise only the existing spack is set up.
+if [ "$1" = "install" ]
+then
+    dir=`pwd`
+    cd $HOME
+    git clone -c feature.manyFiles=true https://github.com/spack/spack.git
+    source $HOME/spack/share/spack/setup-env.sh
+    cd "$dir"   # go back to the directory saved above so the caller's cwd is unchanged
+    spack install autoconf
+    spack install automake
+    spack install c-lime cppflags=-fPIE
+    spack install fftw
+    spack install llvm
+    spack install gmp
+    spack install mpfr
+    spack install cuda@11.8
+    spack install openmpi
+    spack install openssl
+    spack install hdf5
+else
+    source $HOME/spack/share/spack/setup-env.sh
+fi
+# Load the packages into the environment (done on both paths)
+spack load autoconf
+spack load automake
+spack load c-lime
+spack load fftw
+spack load llvm
+spack load gmp
+spack load mpfr
+spack load cuda@11.8
+spack load openmpi
+spack load openssl
+spack load hdf5
+# Export install prefixes: second column of the matching "spack find --paths" line
+export FFTW=`spack find --paths fftw    | grep ^fftw   | awk '{print $2}' `
+export HDF5=`spack find --paths hdf5    | grep ^hdf5   | awk '{print $2}' `
+export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
+export MPFR=`spack find --paths mpfr    | grep ^mpfr  | awk '{print $2}' `
+export GMP=`spack find --paths gmp      | grep ^gmp | awk '{print $2}' `
+export NVIDIA=$CUDA_HOME
+export NVIDIALIB=$NVIDIA/targets/x86_64-linux/lib/
+export LD_LIBRARY_PATH=$NVIDIALIB:$FFTW/lib/:$MPFR/lib:$LD_LIBRARY_PATH
@@ -0,0 +1,43 @@
+#!/bin/bash -l
+#SBATCH --job-name=bench
+##SBATCH --partition=small-g
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=7
+#SBATCH --gpus-per-node=8
+#SBATCH --time=00:10:00
+#SBATCH --account=phy157_dwf
+#SBATCH --gpu-bind=none
+#SBATCH --exclusive
+#SBATCH --mem=0
+# Generate ./select_gpu: a per-rank wrapper that picks GPU and NUMA node from SLURM_LOCALID, then execs its arguments.
+cat << EOF > select_gpu
+#!/bin/bash
+export GPU_MAP=(0 1 2 3 7 6 5 4)
+export NUMA_MAP=(3 3 1 1 2 2 0 0)
+export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
+export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
+export HIP_VISIBLE_DEVICES=\$GPU
+unset ROCR_VISIBLE_DEVICES
+echo RANK \$SLURM_LOCALID using GPU \$GPU    
+exec numactl -m \$NUMA -N \$NUMA "\$@"
+EOF
+
+chmod +x ./select_gpu
+
+root=$HOME/Frontier/Grid/systems/Frontier/
+source ${root}/sourceme.sh
+# OMP_NUM_THREADS matches the --cpus-per-task=7 requested in the SBATCH header
+export OMP_NUM_THREADS=7
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+# Four runs per volume: overlapped then sequential comms, each with --shm-mpi 0 and 1; one log file per run
+for vol in 32.32.32.64
+do
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol
+
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
+done
+
@@ -0,0 +1,23 @@
+# Configure line for a HIP GPU build with MPI comms; CLIME is cut from the "spack find --paths" output for c-lime
+CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
+../../configure --enable-comms=mpi-auto \
+--with-lime=$CLIME \
+--enable-unified=no \
+--enable-shm=nvlink \
+--enable-tracing=timer \
+--enable-accelerator=hip \
+--enable-gen-simd-width=64 \
+--disable-gparity \
+--disable-fermion-reps \
+--enable-simd=GPU \
+--enable-accelerator-cshift \
+--with-gmp=$OLCF_GMP_ROOT \
+--with-fftw=$FFTW_DIR/.. \
+--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
+CXX=hipcc MPICXX=mpicxx \
+CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
+ LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
+
+
+
+
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Select one GPU per rank from the node-local rank, report it, then run the wrapped command.
+lrank=$SLURM_LOCALID
+lgpu=(0 1 2 3 7 6 5 4)
+
+export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]}
+
+echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES "
+
+"$@"
+
+
+
@@ -0,0 +1,13 @@
+. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
+spack load c-lime
+#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
+module load emacs 
+module load PrgEnv-gnu
+module load rocm/5.3.0
+module load cray-mpich/8.1.23
+module load gmp
+module load cray-fftw
+module load craype-accel-amd-gfx90a
+export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
+#Hack for lib: uncomment the next line to also search the current directory for shared libraries
+#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Move the device selection from ROCR_VISIBLE_DEVICES to HIP_VISIBLE_DEVICES, then run the wrapped command.
+export HIP_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES
+unset ROCR_VISIBLE_DEVICES
+
+#rank=$SLURM_PROCID
+#rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace $@
+
+"$@"
@@ -1,8 +1,9 @@
 #!/bin/bash

 num_tile=2
-gpu_id=$(( (MPI_LOCALRANKID / num_tile ) ))
-tile_id=$((MPI_LOCALRANKID % num_tile))
+
+gpu_id=$(( (MPI_LOCAL_RANKID % num_tile ) ))
+tile_id=$((MPI_LOCAL_RANKID / num_tile))

 export ZE_AFFINITY_MASK=$gpu_id.$tile_id

@@ -0,0 +1,62 @@
+#!/bin/sh
+##SBATCH -p PVC-SPR-QZEH 
+##SBATCH -p PVC-ICX-QZNW
+#SBATCH -p QZ1J-ICX-PVC
+##SBATCH -p QZ1J-SPR-PVC-2C
+
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+
+export NT=8
+# NT is passed to the benchmark below as --accelerator-threads
+export I_MPI_OFFLOAD=1
+export I_MPI_OFFLOAD_TOPOLIB=level_zero
+export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
+
+# export IGC_EnableLSCFenceUGMBeforeEOT=0
+# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
+export SYCL_DEVICE_FILTER=gpu,level_zero
+#export IGC_ShaderDumpEnable=1 
+#export IGC_DumpToCurrentDir=1
+export I_MPI_OFFLOAD_CELL=tile
+export EnableImplicitScaling=0
+export EnableWalkerPartition=0
+export ZE_AFFINITY_MASK=0.0
+mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768
+# NOTE(review): every command after this point is commented out, so the exports below currently have no effect
+export ZE_AFFINITY_MASK=0
+export I_MPI_OFFLOAD_CELL=device
+export EnableImplicitScaling=1
+export EnableWalkerPartition=1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#mpiexec -launcher ssh -n 2 -host localhost  vtune -collect gpu-hotspots -knob gpu-sampling-interval=1 -data-limit=0 -r ./vtune_run4 -- ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec  -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec  -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1
+
+#mpiexec  -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec  -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
+
+#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 0
+#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 1
+
@@ -0,0 +1,33 @@
+#!/bin/bash
+##SBATCH -p PVC-SPR-QZEH 
+##SBATCH -p PVC-ICX-QZNW
+
+#SBATCH -p QZ1J-ICX-PVC
+
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+
+export NT=16
+
+# export IGC_EnableLSCFenceUGMBeforeEOT=0
+# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
+#export IGC_ShaderDumpEnable=1 
+#export IGC_DumpToCurrentDir=1
+export I_MPI_OFFLOAD=1
+export I_MPI_OFFLOAD_TOPOLIB=level_zero
+export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
+export SYCL_DEVICE_FILTER=gpu,level_zero
+export I_MPI_OFFLOAD_CELL=tile
+export EnableImplicitScaling=0
+export EnableWalkerPartition=0
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
+# 21 repetitions (i = 0..20) of each of the two rank decompositions; every run writes its own numbered log
+for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+do
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 1.1.1.2.log$i
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 2.1.1.1.log$i 
+done
+# Final run with --comms-sequential; its output is not redirected to a log
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
+
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Set ZE_AFFINITY_MASK to 0.<local rank>, report it, then run the wrapped command.
+export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
+
+echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
+
+
+  "$@"
+
@@ -0,0 +1,16 @@
+# INSTALL supplies the extra -I/-L paths below. NOTE(review): confirm the user directory "paboylx" is spelled as intended
+INSTALL=/nfs/site/home/paboylx/prereqs/
+../../configure \
+	--enable-simd=GPU \
+	--enable-gen-simd-width=64 \
+	--enable-comms=mpi-auto \
+	--disable-accelerator-cshift \
+	--disable-gparity \
+	--disable-fermion-reps \
+	--enable-shm=nvlink \
+	--enable-accelerator=sycl \
+	--enable-unified=no \
+	MPICXX=mpicxx \
+	CXX=dpcpp \
+	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
+	CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"
@@ -0,0 +1,18 @@
+export https_proxy=http://proxy-chain.intel.com:911
+#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH
+# Modules in use; previously tried alternatives are kept commented out further down
+module load intel-release
+module load intel-comp-rt/embargo-ci-neo
+
+#source /opt/intel/oneapi/PVC_setup.sh
+#source /opt/intel/oneapi/ATS_setup.sh
+#module load intel-nightly/20230331
+#module load intel-comp-rt/ci-neo-master/026093
+
+#module load intel/mpich
+module load intel/mpich/pvc45.3
+export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH
+
+#clsh embargo-ci-neo-022845
+#source /opt/intel/vtune_amplifier/amplxe-vars.sh
@@ -20,7 +20,7 @@ unset OMP_PLACES

 cd $PBS_O_WORKDIR

-#qsub jobscript.pbs
+qsub jobscript.pbs

 echo Jobid: $PBS_JOBID
 echo Running on host `hostname`
@@ -44,4 +44,3 @@ CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -enva
 	./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
 	--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"

-$CMD
@@ -45,8 +45,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A

 if [ $PALS_LOCAL_RANKID = 0 ]
 then
-#    onetrace --chrome-device-timeline "$@"
-    "$@"
+    onetrace --chrome-device-timeline "$@"
+#    "$@"
 else
 "$@"
 fi
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \
 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"

@@ -1,4 +1,3 @@
 BREW=/opt/local/
 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug

-
@@ -0,0 +1,319 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_padded_cell.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+gridblasHandle_t GridBLAS::gridblasHandle;
+int            GridBLAS::gridblasInit;
+
+///////////////////////
+// Adaptor: Op, AdjOp and HermOp all forward to the wrapped operator's HermOp,
+// so the little dirac op sees MdagM as its .Op(). Other entry points assert.
+///////////////////////
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & _HermLinOp;
+
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : _HermLinOp(wrapme) {}
+
+  // Forwarded entry points
+  void Op     (const Field &in, Field &out) { _HermLinOp.HermOp(in,out); }
+  void AdjOp  (const Field &in, Field &out) { _HermLinOp.HermOp(in,out); }
+  void HermOp (const Field &in, Field &out) { _HermLinOp.HermOp(in,out); }
+
+  // Unsupported entry points
+  void OpDiag   (const Field &in, Field &out)                  { assert(0); }
+  void OpDir    (const Field &in, Field &out,int dir,int disp) { assert(0); }
+  void OpDirAll (const Field &in, std::vector<Field> &out)     { assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }
+};
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=4;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/4;
+  }
+
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
+							    GridDefaultSimd(Nd,vComplex::Nsimd()),
+							    GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  LatticeFermion    src(FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    ref(FGrid); ref=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+  //  Umu=Zero();
+  
+  RealD mass=0.1;
+  RealD M5=1.8;
+
+  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  const int nbasis = 62;
+  const int cb = 0 ;
+  LatticeFermion prom(FGrid);
+
+  std::vector<LatticeFermion> subspace(nbasis,FGrid);
+
+  std::cout<<GridLogMessage<<"Calling Aggregation class" <<std::endl;
+
+  ///////////////////////////////////////////////////////////
+  // Squared operator is in HermOp
+  ///////////////////////////////////////////////////////////
+  MdagMLinearOperator<DomainWallFermionD,LatticeFermion> HermDefOp(Ddwf);
+
+  ///////////////////////////////////////////////////
+  // Random aggregation space
+  ///////////////////////////////////////////////////
+  std::cout<<GridLogMessage << "Building random aggregation class"<< std::endl;
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse5d,FGrid,cb);
+  Aggregates.CreateSubspaceRandom(RNG5);
+
+  ///////////////////////////////////////////////////
+  // Build little dirac op
+  ///////////////////////////////////////////////////
+  std::cout<<GridLogMessage << "Building little Dirac operator"<< std::endl;
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
+  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse5d);
+  LittleDiracOperator LittleDiracOpCol(geom,FGrid,Coarse5d);
+
+  HermOpAdaptor<LatticeFermionD> HOA(HermDefOp);
+
+  LittleDiracOp.CoarsenOperator(HOA,Aggregates);
+  
+  ///////////////////////////////////////////////////
+  // Test the operator
+  ///////////////////////////////////////////////////
+  CoarseVector c_src (Coarse5d);
+  CoarseVector c_res (Coarse5d);
+  CoarseVector c_res_dag(Coarse5d);
+  CoarseVector c_proj(Coarse5d);
+
+  subspace=Aggregates.subspace;
+
+  //  random(CRNG,c_src);
+  c_src = 1.0;
+
+  blockPromote(c_src,err,subspace);
+
+  prom=Zero();
+  for(int b=0;b<nbasis;b++){
+    prom=prom+subspace[b];
+  }
+  err=err-prom; 
+  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+
+  HermDefOp.HermOp(prom,tmp);
+
+  blockProject(c_proj,tmp,subspace);
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  std::cout<<GridLogMessage<<" Calling little Dirac Op "<<std::endl;
+  LittleDiracOp.M(c_src,c_res);
+  LittleDiracOp.Mdag(c_src,c_res_dag);
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+  std::cout<<GridLogMessage<<"Little dop dag : "<<norm2(c_res_dag)<<std::endl;
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+
+  c_proj = c_proj - c_res;
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+
+  c_res_dag = c_res_dag - c_res;
+  std::cout<<GridLogMessage<<"Little dopDag - dop: "<<norm2(c_res_dag)<<std::endl;
+
+  std::cout<<GridLogMessage << "Testing Hermiticity stochastically "<< std::endl;
+  CoarseVector phi(Coarse5d);
+  CoarseVector chi(Coarse5d);
+  CoarseVector Aphi(Coarse5d);
+  CoarseVector Achi(Coarse5d);
+
+  random(CRNG,phi);
+  random(CRNG,chi);
+
+  std::cout<<GridLogMessage<<"Made randoms "<<norm2(phi)<<" " << norm2(chi)<<std::endl;
+
+  LittleDiracOp.M(phi,Aphi);
+
+  LittleDiracOp.Mdag(chi,Achi);
+
+  std::cout<<GridLogMessage<<"Aphi "<<norm2(Aphi)<<" A chi" << norm2(Achi)<<std::endl;
+
+  ComplexD pAc = innerProduct(chi,Aphi);
+  ComplexD cAp = innerProduct(phi,Achi);
+  ComplexD cAc = innerProduct(chi,Achi);
+  ComplexD pAp = innerProduct(phi,Aphi);
+
+  std::cout<<GridLogMessage<< "pAc "<<pAc<<" cAp "<< cAp<< " diff "<<pAc-adj(cAp)<<std::endl;
+  std::cout<<GridLogMessage<< "pAp "<<pAp<<" cAc "<< cAc<<"Should be real"<< std::endl;
+
+  std::cout<<GridLogMessage<<"Testing linearity"<<std::endl;
+  CoarseVector PhiPlusChi(Coarse5d);
+  CoarseVector APhiPlusChi(Coarse5d);
+  CoarseVector linerr(Coarse5d);
+  PhiPlusChi = phi+chi;
+  LittleDiracOp.M(PhiPlusChi,APhiPlusChi);
+
+  linerr= APhiPlusChi-Aphi;
+  linerr= linerr-Achi;
+  std::cout<<GridLogMessage<<"**Diff "<<norm2(linerr)<<std::endl;
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  //////////////////////////////////////////////////////////////////////////////////////
+  //  Create a higher dim coarse grid
+  //////////////////////////////////////////////////////////////////////////////////////
+
+  const int nrhs=vComplex::Nsimd()*3;
+
+  Coordinate mpi=GridDefaultMpi();
+  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
+  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
+  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
+
+  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
+
+  
+  MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
+  typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
+  
+  //////////////////////////////////////////
+  // Test against single RHS
+  //////////////////////////////////////////
+  {
+    GridParallelRNG          rh_CRNG(CoarseMrhs);rh_CRNG.SeedFixedIntegers(cseeds);
+    CoarseVector rh_phi(CoarseMrhs);
+    CoarseVector rh_res(CoarseMrhs);
+    random(rh_CRNG,rh_phi);
+
+    std::cout << "Warmup"<<std::endl;
+    mrhs.M(rh_phi,rh_res);
+    const int ncall=5;
+    RealD t0=-usecond();
+    for(int i=0;i<ncall;i++){
+      std::cout << "Call "<<i<<"/"<<ncall<<std::endl;
+      mrhs.M(rh_phi,rh_res);
+    }
+    t0+=usecond();
+    RealD t1=0;
+    for(int r=0;r<nrhs;r++){
+      std::cout << " compare to single RHS "<<r<<"/"<<nrhs<<std::endl;
+      ExtractSlice(phi,rh_phi,r,0);
+      ExtractSlice(chi,rh_res,r,0);
+      LittleDiracOp.M(phi,Aphi);
+      t1-=usecond();
+      for(int i=0;i<ncall;i++){
+	std::cout << "Call "<<i<<"/"<<ncall<<std::endl;
+	LittleDiracOp.M(phi,Aphi);
+      }
+      t1+=usecond();
+      Coordinate site({0,0,0,0,0});
+      auto  bad = peekSite(chi,site);
+      auto good = peekSite(Aphi,site);
+      std::cout << " mrhs [" <<r <<"] "<< norm2(chi)<<std::endl;
+      std::cout << " srhs [" <<r <<"] "<< norm2(Aphi)<<std::endl;
+      chi=chi-Aphi;
+      RealD diff =norm2(chi);
+      std::cout << r << " diff " << diff<<std::endl;
+      assert(diff < 1.0e-10);
+    }
+    std::cout << nrhs<< " mrhs " << t0/ncall/nrhs <<" us"<<std::endl;
+    std::cout << nrhs<< " srhs " << t1/ncall/nrhs <<" us"<<std::endl;
+  }
+
+  //////////////////////////////////////////
+  // Run conjugate gradient on the multi-RHS coarse operator
+  //////////////////////////////////////////
+  {
+    typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> HermMatrix;
+    HermMatrix MrhsCoarseOp     (mrhs);
+
+    GridParallelRNG          rh_CRNG(CoarseMrhs);rh_CRNG.SeedFixedIntegers(cseeds);
+    ConjugateGradient<CoarseVector>  mrhsCG(1.0e-8,2000,true);
+    CoarseVector rh_res(CoarseMrhs);
+    CoarseVector rh_src(CoarseMrhs);
+    random(rh_CRNG,rh_src);
+    rh_res= Zero();
+    mrhsCG(MrhsCoarseOp,rh_src,rh_res);
+  }
+  
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+
+  Grid_finalize();
+  return 0;
+}
@@ -0,0 +1,426 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_general_coarse_hdcg.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+//#include <Grid/algorithms/GeneralCoarsenedMatrix.h>
+#include <Grid/algorithms/iterative/AdefGeneric.h>
+
+using namespace std;
+using namespace Grid;
+
+// Write the coarse operator's link fields to a SciDAC file, one record per entry
+// of Operator._A, in index order. Each entry is passed through
+// Operator.Cell.Extract before it is written. The body is compiled only when
+// HAVE_LIME is defined; otherwise the call does nothing.
+template<class Coarsened>
+void SaveOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacWriter writer(Operator.Grid()->IsBoss());
+  assert(Operator._A.size()==Operator.geom.npoint);
+  writer.open(file);
+  for(auto &link : Operator._A){
+    auto extracted = Operator.Cell.Extract(link);
+    writer.writeScidacFieldRecord(extracted,userRecord);
+  }
+  writer.close();
+#endif
+}
+// Read the coarse operator's link fields back from a SciDAC file: one record per
+// stencil point p is read into Operator._A[p] (after checking that it lives on
+// Operator.CoarseGrid()), then Operator.ExchangeCoarseLinks() is called.
+// A build without HAVE_LIME cannot read the file. Instead of returning with
+// Operator._A never filled (and letting the caller use it), report and stop.
+template<class Coarsened>
+void LoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
+    RD.readScidacFieldRecord(Operator._A[p],record);
+  }
+  RD.close();
+  Operator.ExchangeCoarseLinks();
+#else
+  std::cout<<GridLogMessage<<"LoadOperator: built without HAVE_LIME, cannot read "<<file<<std::endl;
+  assert(0);
+#endif
+}
+// Write every vector of Agg.subspace to a SciDAC file, one record per vector, in
+// index order. Compiled to an empty body unless HAVE_LIME is defined.
+template<class aggregation>
+void SaveBasis(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacWriter writer(Agg.FineGrid->IsBoss());
+  writer.open(file);
+  for(auto &basisVector : Agg.subspace){
+    writer.writeScidacFieldRecord(basisVector,userRecord);
+  }
+  writer.close();
+#endif
+}
+// Fill Agg.subspace from a SciDAC file: one record is read into each existing
+// entry of Agg.subspace, in index order (the vector is not resized here).
+// A build without HAVE_LIME cannot read the file. Instead of returning with the
+// subspace never filled (and letting the caller use it), report and stop.
+template<class aggregation>
+void LoadBasis(aggregation &Agg, std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacReader RD ;
+  RD.open(file);
+  for(int b=0;b<Agg.subspace.size();b++){
+    RD.readScidacFieldRecord(Agg.subspace[b],record);
+  }
+  RD.close();
+#else
+  std::cout<<GridLogMessage<<"LoadBasis: built without HAVE_LIME, cannot read "<<file<<std::endl;
+  assert(0);
+#endif
+}
+
+
+// Trivial LinearFunction: ignores `in` and sets `out` to Zero().
+template<class Field>
+class TestSolver : public LinearFunction<Field>
+{
+public:
+  TestSolver() {}
+  void operator() (const Field &in, Field &out)
+  {
+    out = Zero();
+  }
+};
+
+
+// Reciprocal 1/x; passed as the function argument when ChebyshevSmoother
+// constructs its Chebyshev object. No guard against x == 0.
+RealD InverseApproximation(RealD x){
+  const RealD one = 1.0;
+  return one/x;
+}
+
+// Adaptor that makes Op, AdjOp and HermOp of a LinearOperatorBase all apply the
+// wrapped operator's HermOp. Original intent note: "Want Op in CoarsenOp to call
+// MatPcDagMatPc". OpDiag, OpDir, OpDirAll and HermOpAndNorm are unsupported and
+// hit assert(0).
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & target; // operator whose HermOp is invoked
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : target(wrapme) {}
+
+  // Supported entry points: each one applies target.HermOp
+  void HermOp(const Field &in, Field &out) { target.HermOp(in,out); }
+  void Op    (const Field &in, Field &out) { target.HermOp(in,out); }
+  void AdjOp (const Field &in, Field &out) { target.HermOp(in,out); }
+
+  // Unsupported entry points
+  void OpDiag (const Field &in, Field &out)
+  {
+    assert(0);
+  }
+  void OpDir (const Field &in, Field &out,int dir,int disp)
+  {
+    assert(0);
+  }
+  void OpDirAll (const Field &in, std::vector<Field> &out)
+  {
+    assert(0);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
+  {
+    assert(0);
+  }
+};
+// LinearFunction that applies a Chebyshev<Field> object, constructed as
+// Chebyshev(_lo,_hi,_ord,InverseApproximation), in the operator given at
+// construction. The Matrix template parameter is not used inside the class.
+template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+
+  FineOperator   & _SmootherOperator; // operator handed to Cheby on every call
+  Chebyshev<Field> Cheby;             // set up once in the constructor
+
+  // Stores the operator reference, builds Cheby and logs order and interval.
+  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator)
+    : _SmootherOperator(SmootherOperator),
+      Cheby(_lo,_hi,_ord,InverseApproximation)
+  {
+    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
+  }
+
+  // Copies `in` into a local field and calls Cheby(_SmootherOperator,copy,out).
+  void operator() (const Field &in, Field &out)
+  {
+    Field input(in.Grid());
+    input = in;
+    Cheby(_SmootherOperator,input,out);
+  }
+};
+
+int main (int argc, char ** argv) // test driver: builds a coarsened operator, then runs coarse-space and two-level (HDCG) solves
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=24; // extent handed to makeFiveDimGrid / makeFiveDimRedBlackGrid for the fine grids
+  const int nbasis = 40; // template argument of GeneralCoarsenedMatrix and Aggregation; also passed to CreateSubspaceChebyshev
+  const int cb = 0 ; // third argument of the Aggregation constructor
+  RealD mass=0.00078; // mass, M5, b, c: trailing arguments of the MobiusFermionD constructor below
+  RealD M5=1.8;
+  RealD b=1.5;
+  RealD c=0.5;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid with 4^4 cell: each extent of the default lattice is divided by 4 (integer division)
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
+							    GridDefaultSimd(Nd,vComplex::Nsimd()),
+							    GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  ///////////////////////// RNGs /////////////////////////////////
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8}); // same values as seeds5
+
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); // seeded but not used again in this function
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  ///////////////////////// Configuration /////////////////////////////////
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000"); // relative path handed to NerscIO::readConfiguration
+  NerscIO::readConfiguration(Umu,header,file);
+
+  //////////////////////// Fermion action //////////////////////////////////
+  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
+
+  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
+
+  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
+  HermFineMatrix FineHermOp(HermOpEO);
+  
+  LatticeFermion result(FrbGrid); result=Zero();
+
+  LatticeFermion    src(FrbGrid); random(RNG5,src);
+
+  // Run power method on HermOpEO, the operator that FineHermOp wraps
+  PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src);
+
+ 
+  ////////////////////////////////////////////////////////////
+  ///////////// Coarse basis and Little Dirac Operator ///////
+  ////////////////////////////////////////////////////////////
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
+  NearestStencilGeometry5D geom_nn(Coarse5d);
+  
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp -- NOTE(review): no PVdagM exists in this function; presumably stale, confirm
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse5d,FrbGrid,cb);
+
+  ////////////////////////////////////////////////////////////
+  // Need to check about red-black grid coarsening
+  ////////////////////////////////////////////////////////////
+  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
+
+  bool load=false; // false: build the subspace and coarse operator, then save both; true: read both back from the files below
+  if ( load ) {
+    LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
+    LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
+  } else {
+    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.1,
+				       //				     400,200,200 -- 48 iters
+				       //				     600,200,200 -- 38 iters, 162s
+				       //				     600,200,100 -- 38 iters, 169s
+				       //				     600,200,50  -- 88 iters. 370s 
+				       800,
+				       200,
+				       100,
+				       0.0);
+    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
+    SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
+    SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
+  }
+  
+  // Try projecting to one hop only: second coarse operator on the nearest-neighbour geometry geom_nn
+  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
+  LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
+
+  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
+  HermMatrix CoarseOp     (LittleDiracOp);
+  HermMatrix CoarseOpProj (LittleDiracOpProj);
+  
+  //////////////////////////////////////////
+  // Build a coarse lanczos
+  //////////////////////////////////////////
+  Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter
+  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
+  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
+  int Nk=48;
+  int Nm=64; // also the size of the eval / evec containers below
+  int Nstop=Nk;
+  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20);
+
+  int Nconv; // passed to IRL.calc below; not read afterwards in this function
+  std::vector<RealD>            eval(Nm);
+  std::vector<CoarseVector>     evec(Nm,Coarse5d);
+  CoarseVector c_src(Coarse5d);
+  //c_src=1.0;
+  random(CRNG,c_src);
+
+  CoarseVector c_res(Coarse5d); 
+  CoarseVector c_ref(Coarse5d); 
+
+  PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src);
+
+  IRL.calc(eval,evec,c_src,Nconv);
+  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
+
+  //////////////////////////////////////////
+  // Build a coarse space solver
+  //////////////////////////////////////////
+  int maxit=20000;
+  ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false);
+  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,10000,false);
+  ZeroGuesser<CoarseVector> CoarseZeroGuesser; // only referenced by the commented-out HPDSolve line below
+
+  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
+  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
+  c_res=Zero();
+  HPDSolve(c_src,c_res); c_ref = c_res; // c_ref is the reference the later coarse solves are compared against
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
+  //////////////////////////////////////////////////////////////////////////
+  // Deflated (with real op EV's) solve for the projected coarse op
+  // Work towards ADEF1 in the coarse space
+  //////////////////////////////////////////////////////////////////////////
+  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
+  c_res=Zero();
+  HPDSolveProj(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
+  c_res = c_res - c_ref;
+  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Coarse ADEF1 with deflation space
+  //////////////////////////////////////////////////////////////////////
+  ChebyshevSmoother<CoarseVector,HermMatrix >
+    CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
+    //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  //
+  //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s
+  //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s
+  //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
+  //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
+  //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311
+
+  ////////////////////////////////////////////////////////
+  // CG, Cheby mode spacing 200,200
+  // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
+  // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s
+  // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
+  ////////////////////////////////////////////////////////
+  // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs 
+  ////////////////////////////////////////////////////////
+  // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain
+  // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
+  // HDCG 38 iters 162s
+  //
+  // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs 
+  // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain
+  // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
+  // HDCG 38 iters 169s
+
+  TwoLevelADEF1defl<CoarseVector>
+    cADEF1(1.0e-8, 500,
+	   CoarseOp,
+	   CoarseSmoother,
+	   evec,eval);
+
+  c_res=Zero();
+  cADEF1(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  c_res = c_res - c_ref;
+  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  
+  //  cADEF1.Tolerance = 4.0e-2;
+  //  cADEF1.Tolerance = 1.0e-1;
+  cADEF1.Tolerance = 5.0e-2; // change the tolerance and repeat the same solve; cADEF1 keeps this value when used inside HDCGdefl below
+  c_res=Zero();
+  cADEF1(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  c_res = c_res - c_ref;
+  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  
+  //////////////////////////////////////////
+  // Build a smoother
+  //////////////////////////////////////////
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328
+  //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1
+  //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations
+  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters
+  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters
+
+  //
+  // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40
+  // Need to measure cost of coarse space.
+  //
+  // -- i) Reduce coarse residual   -- 0.04
+  // -- ii) Lanczos on coarse space -- done
+  // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
+  //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
+  //
+  std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters
+
+  //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
+  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)  
+
+  for(int l=0;l<los.size();l++){
+
+    RealD lo = los[l];
+
+    for(int o=0;o<ords.size();o++){
+
+      ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false);
+      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);
+      
+      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
+      ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp);  // 311
+
+      //////////////////////////////////////////
+      // Build a HDCG solver
+      //////////////////////////////////////////
+      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCG(1.0e-8, 100,
+	     FineHermOp,
+	     Smoother,
+	     HPDSolveSloppy,
+	     HPDSolve,
+	     Aggregates);
+
+      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCGdefl(1.0e-8, 100,
+		 FineHermOp,
+		 Smoother,
+		 cADEF1,
+		 HPDSolve,
+		 Aggregates);
+      
+      result=Zero();
+      HDCGdefl(src,result); // variant whose fifth constructor argument is cADEF1
+
+      result=Zero();
+      HDCG(src,result); // variant whose fifth constructor argument is HPDSolveSloppy
+
+      
+    }
+  }
+
+  // Standard CG on HermOpEO with the same source, run after the two-level solves
+  result=Zero();
+  CGfine(HermOpEO, src, result);
+  
+  Grid_finalize();
+  return 0;
+}
@@ -0,0 +1,641 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_general_coarse_hdcg.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+//#include <Grid/algorithms/GeneralCoarsenedMatrix.h>
+#include <Grid/algorithms/iterative/AdefGeneric.h>
+
+using namespace std;
+using namespace Grid;
+
+// Write the coarse operator's link fields to a SciDAC file, one record per entry
+// of Operator._A, in index order. Each entry is passed through
+// Operator.Cell.Extract before it is written, and the two trailing
+// writeScidacFieldRecord arguments are both 0 (BINARYIO_LEXICOGRAPHIC was the
+// alternative tried for the last one). Does nothing unless HAVE_LIME is defined.
+template<class Coarsened>
+void SaveOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacWriter writer(Operator.Grid()->IsBoss());
+  assert(Operator._A.size()==Operator.geom.npoint);
+  writer.open(file);
+  for(auto &link : Operator._A){
+    auto extracted = Operator.Cell.Extract(link);
+    writer.writeScidacFieldRecord(extracted,userRecord,0,0);
+  }
+  writer.close();
+#endif
+}
+// Read the coarse operator's link fields back from a SciDAC file: one record per
+// stencil point p is read into Operator._A[p] (after checking that it lives on
+// Operator.CoarseGrid()), then Operator.ExchangeCoarseLinks() is called.
+// The trailing 0 passed to readScidacFieldRecord replaces the
+// BINARYIO_LEXICOGRAPHIC value tried earlier.
+// A build without HAVE_LIME cannot read the file. Instead of returning with
+// Operator._A never filled (and letting the caller use it), report and stop.
+template<class Coarsened>
+void LoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
+    RD.readScidacFieldRecord(Operator._A[p],record,0);
+  }
+  RD.close();
+  Operator.ExchangeCoarseLinks();
+#else
+  std::cout<<GridLogMessage<<"LoadOperator: built without HAVE_LIME, cannot read "<<file<<std::endl;
+  assert(0);
+#endif
+}
+// Read the coarse operator's link fields from a SciDAC file written by
+// SaveOperator. For each stencil point p a buffer is obtained from
+// Operator.Cell.Extract(Operator._A[p]), one record is read into it, and
+// Operator._A[p] is assigned Operator.Cell.ExchangePeriodic(buffer).
+// NOTE(review): the extracted field presumably only serves as a correctly sized
+// read target whose contents the read overwrites -- confirm.
+// A build without HAVE_LIME cannot read the file. Instead of returning with
+// Operator._A unchanged (and letting the caller use it), report and stop.
+template<class Coarsened>
+void ReLoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    auto tmp=Operator.Cell.Extract(Operator._A[p]);
+    RD.readScidacFieldRecord(tmp,record,0);
+    Operator._A[p] = Operator.Cell.ExchangePeriodic(tmp);
+  }
+  RD.close();
+#else
+  std::cout<<GridLogMessage<<"ReLoadOperator: built without HAVE_LIME, cannot read "<<file<<std::endl;
+  assert(0);
+#endif
+}
+// Write every vector of Agg.subspace to a SciDAC file, one record per vector, in
+// index order. The two trailing writeScidacFieldRecord arguments are both 0
+// (BINARYIO_LEXICOGRAPHIC was the alternative tried for the last one).
+// Compiled to an empty body unless HAVE_LIME is defined.
+template<class aggregation>
+void SaveBasis(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacWriter writer(Agg.FineGrid->IsBoss());
+  writer.open(file);
+  for(auto &basisVector : Agg.subspace){
+    writer.writeScidacFieldRecord(basisVector,userRecord,0,0);
+  }
+  writer.close();
+#endif
+}
+// Fill Agg.subspace from a SciDAC file: one record is read into each existing
+// entry of Agg.subspace, in index order (the vector is not resized here). The
+// trailing 0 passed to readScidacFieldRecord replaces the
+// BINARYIO_LEXICOGRAPHIC value tried earlier.
+// A build without HAVE_LIME cannot read the file. Instead of returning with the
+// subspace never filled (and letting the caller use it), report and stop.
+template<class aggregation>
+void LoadBasis(aggregation &Agg, std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacReader RD ;
+  RD.open(file);
+  for(int b=0;b<Agg.subspace.size();b++){
+    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
+  }
+  RD.close();
+#else
+  std::cout<<GridLogMessage<<"LoadBasis: built without HAVE_LIME, cannot read "<<file<<std::endl;
+  assert(0);
+#endif
+}
+
+// Reciprocal 1/x; passed as the function argument when ChebyshevSmoother
+// constructs its Chebyshev object. No guard against x == 0.
+RealD InverseApproximation(RealD x){
+  const RealD one = 1.0;
+  return one/x;
+}
+
+// Adaptor that makes Op, AdjOp and HermOp of a LinearOperatorBase all apply the
+// wrapped operator's HermOp. Original intent note: "Want Op in CoarsenOp to call
+// MatPcDagMatPc". OpDiag, OpDir, OpDirAll and HermOpAndNorm are unsupported and
+// hit assert(0).
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & target; // operator whose HermOp is invoked
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : target(wrapme) {}
+
+  // Supported entry points: each one applies target.HermOp
+  void HermOp(const Field &in, Field &out) { target.HermOp(in,out); }
+  void Op    (const Field &in, Field &out) { target.HermOp(in,out); }
+  void AdjOp (const Field &in, Field &out) { target.HermOp(in,out); }
+
+  // Unsupported entry points
+  void OpDiag (const Field &in, Field &out)
+  {
+    assert(0);
+  }
+  void OpDir (const Field &in, Field &out,int dir,int disp)
+  {
+    assert(0);
+  }
+  void OpDirAll (const Field &in, std::vector<Field> &out)
+  {
+    assert(0);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
+  {
+    assert(0);
+  }
+};
+// LinearFunction that applies a Chebyshev<Field> object, constructed as
+// Chebyshev(_lo,_hi,_ord,InverseApproximation), in the operator given at
+// construction.
+template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+
+  FineOperator   & _SmootherOperator; // operator handed to Cheby on every call
+  Chebyshev<Field> Cheby;             // set up once in the constructor
+
+  // Stores the operator reference, builds Cheby and logs order and interval.
+  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator)
+    : _SmootherOperator(SmootherOperator),
+      Cheby(_lo,_hi,_ord,InverseApproximation)
+  {
+    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
+  }
+
+  // Copies `in` into a local field and calls Cheby(_SmootherOperator,copy,out).
+  void operator() (const Field &in, Field &out)
+  {
+    Field input(in.Grid());
+    input = in;
+    Cheby(_SmootherOperator,input,out);
+  }
+};
+
+// LinearFunction smoother that runs conjugate gradient on the operator given at
+// construction. Each call builds ConjugateGradient<Field>(0.0,iters,false): per
+// the original note, failing to converge is acceptable for a smoother.
+template<class Field> class CGSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator   & _SmootherOperator; // operator handed to CG on every call
+  int iters;                          // second ConjugateGradient constructor argument
+  CGSmoother(int _iters, FineOperator &SmootherOperator) :
+    _SmootherOperator(SmootherOperator),
+    iters(_iters)
+  {
+    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
+  };
+  // Applies the smoother: out is overwritten, in is left untouched.
+  void operator() (const Field &in, Field &out) 
+  {
+    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother
+    // CG receives `out` as a read-write argument, so give it a defined starting
+    // value: the result must depend only on `in`, not on whatever `out` held
+    // (callers may pass a freshly constructed field).
+    out = Zero();
+    CG(_SmootherOperator,in,out);
+  }
+};
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=24;
+  const int nbasis = 62;
+  //  const int nbasis = 56;
+  //  const int nbasis = 44;
+  const int cb = 0 ;
+  RealD mass=0.00078;
+  RealD M5=1.8;
+  RealD b=1.5;
+  RealD c=0.5;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid with a 4x4x6x4 cell (see Block below)
+  Coordinate Block({4,4,6,4});
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/Block[d];
+  }
+
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
+							    GridDefaultSimd(Nd,vComplex::Nsimd()),
+							    GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  ///////////////////////// RNGs /////////////////////////////////
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  ///////////////////////// Configuration /////////////////////////////////
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat.1000");
+  NerscIO::readConfiguration(Umu,header,file);
+
+  //////////////////////// Fermion action //////////////////////////////////
+  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
+
+  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
+
+  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
+  HermFineMatrix FineHermOp(HermOpEO);
+
+  LatticeFermion result(FrbGrid); result=Zero();
+
+  LatticeFermion    src(FrbGrid); random(RNG5,src);
+
+  // Run power method on FineHermOp
+  PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src);
+ 
+  ////////////////////////////////////////////////////////////
+  ///////////// Coarse basis and Little Dirac Operator ///////
+  ////////////////////////////////////////////////////////////
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
+  NearestStencilGeometry5D geom_nn(Coarse5d);
+  
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse5d,FrbGrid,cb);
+
+  ////////////////////////////////////////////////////////////
+  // Need to check about red-black grid coarsening
+  ////////////////////////////////////////////////////////////
+  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
+
+  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.rat.scidac.62");
+  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.rat.scidac.62");
+  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.rat.scidac.62");
+  bool load_agg=true;
+  bool load_refine=true;
+  bool load_mat=true;
+  if ( load_agg ) {
+    LoadBasis(Aggregates,subspace_file);
+  } else {
+
+    // NBASIS=40
+    // Best so far: ord 2000 [0.01,95], 500,500  -- 466 iters
+    // slurm-398626.out:Grid : Message : 141.295253 s : 500 filt [1] <n|MdagM|n> 0.000103622063
+
+
+    //Grid : Message : 33.870465 s :  Chebyshev subspace pass-1 : ord 2000 [0.001,95]
+    //Grid : Message : 33.870485 s :  Chebyshev subspace pass-2 : nbasis40 min 1000 step 1000 lo0
+    //slurm-1482200.out : filt ~ 0.004 -- not as low mode projecting -- took 626 iters
+
+    // To try: 2000 [0.1,95]  ,2000,500,500 -- slurm-1482213.out 586 iterations
+
+    // To try: 2000 [0.01,95] ,2000,500,500 -- 469 (think I bumped 92 to 95) (??)
+    // To try: 2000 [0.025,95],2000,500,500
+    // To try: 2000 [0.005,95],2000,500,500
+
+    // NBASIS=44 -- HDCG paper was 64 vectors; AMD compiler craps out at 48
+    // To try: 2000 [0.01,95] ,2000,500,500 -- 419 lowest slurm-1482355.out
+    // To try: 2000 [0.025,95] ,2000,500,500 -- 487 
+    // To try: 2000 [0.005,95] ,2000,500,500
+    /*
+      Smoother [3,92] order 16
+slurm-1482355.out:Grid : Message : 35.239686 s :  Chebyshev subspace pass-1 : ord 2000 [0.01,95]
+slurm-1482355.out:Grid : Message : 35.239714 s :  Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0
+slurm-1482355.out:Grid : Message : 5561.305552 s : HDCG: Pcg converged in 419 iterations and 2616.202598 s
+
+slurm-1482367.out:Grid : Message : 43.157235 s :  Chebyshev subspace pass-1 : ord 2000 [0.025,95]
+slurm-1482367.out:Grid : Message : 43.157257 s :  Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0
+slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 iterations and 3131.185821 s
+    */
+		 /*
+		   Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.0075,
+				       2500,
+				       500,
+				       500,
+				       0.0);
+		 */
+
+		 /*
+		   Aggregates.CreateSubspaceChebyshevPowerLaw(RNG5,HermOpEO,nbasis,
+							      95.0,
+							      2000);
+		 */
+
+    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
+					0.0003,1.0e-5,2000); // Lo, tol, maxit
+  /*
+    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.05,
+				       2000,
+				       500,
+				       500,
+				       0.0);
+ */
+    /*
+      Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.01,
+				       2000,
+				       500,
+				       500,
+				       0.0);
+    */
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); -- running slurm-1484934.out nbasis 56
+
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); <== last run
+    SaveBasis(Aggregates,subspace_file);
+  }
+
+  int refine=1;
+  if(refine){
+    if ( load_refine ) {
+      LoadBasis(Aggregates,refine_file);
+    } else {
+      // HDCG used Pcg to refine
+      Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000);
+      SaveBasis(Aggregates,refine_file);
+    }
+  }
+
+  Aggregates.Orthogonalise();
+  if ( load_mat ) {
+    LoadOperator(LittleDiracOp,ldop_file);
+  } else {
+    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
+    SaveOperator(LittleDiracOp,ldop_file);
+  }
+
+  // I/O test:
+  CoarseVector c_src(Coarse5d);   random(CRNG,c_src);
+  CoarseVector c_res(Coarse5d); 
+  CoarseVector c_ref(Coarse5d);
+
+  // Try projecting to one hop only
+  //  LittleDiracOp.ShiftMatrix(1.0e-4);
+  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
+  LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
+
+  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
+  HermMatrix CoarseOp     (LittleDiracOp);
+  HermMatrix CoarseOpProj (LittleDiracOpProj);
+  
+  //////////////////////////////////////////
+  // Build a coarse lanczos
+  //////////////////////////////////////////
+  //  Chebyshev<CoarseVector>      IRLCheby(0.012,40.0,201);  //500 HDCG iters
+  //  int Nk=512; // Didn't save much
+  //  int Nm=640;
+  //  int Nstop=400;
+
+  //  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  //319 HDCG iters @ 128//160 nk.
+  //  int Nk=128;
+  //  int Nm=160;
+  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  //319 HDCG iters @ 128//160 nk.
+  int Nk=192;
+  int Nm=256;
+  int Nstop=Nk;
+  
+  //  Chebyshev<CoarseVector>      IRLCheby(0.010,45.0,201);  // 1 iter
+  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
+  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
+  
+  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1e-5,10);
+
+  int Nconv;
+  std::vector<RealD>            eval(Nm);
+  std::vector<CoarseVector>     evec(Nm,Coarse5d);
+
+  PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src);
+
+  IRL.calc(eval,evec,c_src,Nconv);
+  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
+
+  //////////////////////////////////////////
+  // Build a coarse space solver
+  //////////////////////////////////////////
+  int maxit=30000;
+  ConjugateGradient<CoarseVector>  CG(1.0e-10,maxit,false);
+  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
+  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
+
+  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
+  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
+  c_res=Zero();
+  //  HPDSolve(c_src,c_res); c_ref = c_res;
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
+  //////////////////////////////////////////////////////////////////////////
+  // Deflated (with real op EV's) solve for the projected coarse op
+  // Work towards ADEF1 in the coarse space
+  //////////////////////////////////////////////////////////////////////////
+  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
+  c_res=Zero();
+  //  HPDSolveProj(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Coarse ADEF1 with deflation space
+  //////////////////////////////////////////////////////////////////////
+  ChebyshevSmoother<CoarseVector >  CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
+    //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  //
+  //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s
+  //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s
+  //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
+  //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
+  //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311
+
+  ////////////////////////////////////////////////////////
+  // CG, Cheby mode spacing 200,200
+  // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
+  // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s
+  // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
+  ////////////////////////////////////////////////////////
+  // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs 
+  ////////////////////////////////////////////////////////
+  // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain
+  // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
+  // HDCG 38 iters 162s
+  //
+  // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs 
+  // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain
+  // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
+  // HDCG 38 iters 169s
+
+  TwoLevelADEF1defl<CoarseVector>
+    cADEF1(1.0e-8, 500,
+	   CoarseOp,
+	   CoarseSmoother,
+	   evec,eval);
+
+  //  c_res=Zero();
+  //  cADEF1(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  
+  //  cADEF1.Tolerance = 4.0e-2;
+  //  cADEF1.Tolerance = 1.0e-1;
+  //  cADEF1.Tolerance = 5.0e-2;
+  //  c_res=Zero();
+  //  cADEF1(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  
+  //////////////////////////////////////////
+  // Build a smoother
+  //////////////////////////////////////////
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328
+  //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1
+  //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations
+  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters
+  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters
+
+  //
+  // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40
+  // Need to measure cost of coarse space.
+  //
+  // -- i) Reduce coarse residual   -- 0.04
+  // -- ii) Lanczos on coarse space -- done
+  // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
+  //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
+  //
+  //
+  //
+  //
+  
+  std::vector<RealD> los({2.0,2.5}); // Nbasis 40 == 36,36 iters
+
+  //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
+  //  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
+  std::vector<int> ords({9}); // Nbasis 40 == 40 iters (320 mults)  
+
+ /*
+   Smoother opt @56 nbasis, 0.04 convergence, 192 evs
+ ord lo
+
+ 16   0.1  no converge -- likely sign indefinite
+ 32   0.1  no converge -- likely sign indefinite(?)
+
+ 16   0.5  422
+ 32   0.5  302
+ 
+ 8   1.0  575
+ 12  1.0  449
+ 16  1.0  375
+ 32  1.0  302
+
+ 12  3.0  476
+ 16  3.0  319
+ 32  3.0  306
+
+ Powerlaw setup 62 vecs
+slurm-1494943.out:Grid : Message : 4874.186617 s : HDCG: Pcg converged in 171 iterations and 1706.548006 s 1.0 32
+slurm-1494943.out:Grid : Message : 6490.121648 s : HDCG: Pcg converged in 194 iterations and 1616.219654 s 1.0 16
+
+ Cheby setup: 56vecs
+ -- CG smoother O(16): 487
+ 
+Power law setup, 56 vecs -- lambda^-5
+slurm-1494383.out:Grid : Message : 4377.173265 s : HDCG: Pcg converged in 204 iterations and 1153.548935 s 1.0 32
+
+Power law setup, 56 vecs -- lambda^-3
+
+slurm-1494242.out:Grid : Message : 4370.464814 s : HDCG: Pcg converged in 204 iterations and 1143.494776 s  1.0 32
+slurm-1494242.out:Grid : Message : 5432.414820 s : HDCG: Pcg converged in 237 iterations and 1061.455882 s  1.0 16
+slurm-1494242.out:Grid : Message : 6588.727977 s : HDCG: Pcg converged in 205 iterations and 1156.565210 s  0.5 32
+
+ Power law setup, 56 vecs -- lambda^-4
+ -- CG smoother    O(16): 290
+ -- Cheby smoother O(16): 218 -- getting close to the deflation level I expect 169 from BFM paper @O(7) smoother and 64 nbasis
+
+Grid : Message : 2790.797194 s : HDCG: Pcg converged in 190 iterations and 1049.563182 s 1.0 32
+Grid : Message : 3766.374396 s : HDCG: Pcg converged in 218 iterations and 975.455668 s  1.0 16
+Grid : Message : 4888.746190 s : HDCG: Pcg converged in 191 iterations and 1122.252055 s 0.5 32
+Grid : Message : 5956.679661 s : HDCG: Pcg converged in 231 iterations and 1067.812850 s 0.5 16
+
+Grid : Message : 2767.405829 s : HDCG: Pcg converged in 218 iterations and 967.214067 s -- 16
+Grid : Message : 3816.165905 s : HDCG: Pcg converged in 251 iterations and 1048.636269 s -- 12
+Grid : Message : 5121.206572 s : HDCG: Pcg converged in 318 iterations and 1304.916168 s -- 8
+
+ 
+[paboyle@login2.crusher debug]$ grep -v Memory slurm-402426.out  | grep converged | grep HDCG -- [1.0,16] cheby
+Grid : Message : 5185.521063 s : HDCG: Pcg converged in 377 iterations and 1595.843529 s
+
+[paboyle@login2.crusher debug]$ grep HDCG  slurm-402184.out | grep onver
+Grid : Message : 3760.438160 s : HDCG: Pcg converged in 422 iterations and 2129.243141 s
+Grid : Message : 5660.588015 s : HDCG: Pcg converged in 308 iterations and 1900.026821 s
+
+ 
+Grid : Message : 4238.206528 s : HDCG: Pcg converged in 575 iterations and 2657.430676 s
+Grid : Message : 6345.880344 s : HDCG: Pcg converged in 449 iterations and 2108.505208 s
+
+grep onverg slurm-401663.out | grep HDCG
+Grid : Message : 3900.817781 s : HDCG: Pcg converged in 476 iterations and 1992.591311 s
+Grid : Message : 5647.202699 s : HDCG: Pcg converged in 306 iterations and 1746.838660 s
+
+
+[paboyle@login2.crusher debug]$ grep converged slurm-401775.out | grep HDCG
+Grid : Message : 3583.177025 s : HDCG: Pcg converged in 375 iterations and 1800.896037 s
+Grid : Message : 5348.342243 s : HDCG: Pcg converged in 302 iterations and 1765.045018 s
+
+Conclusion: higher order smoother is doing better. Much better. Use a Krylov smoother instead Mirs as in BFM version.
+
+ */
+				      //
+  for(int l=0;l<los.size();l++){
+
+    RealD lo = los[l];
+
+    for(int o=0;o<ords.size();o++){
+
+      ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false);
+      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);
+      
+      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
+      ChebyshevSmoother<LatticeFermionD > ChebySmooth(lo,95,ords[o],FineHermOp);  // 311
+
+      /*
+       * CG smooth 11 iter: 
+       slurm-403825.out:Grid : Message : 4369.824339 s : HDCG: fPcg converged in 215 iterations 3.0
+       slurm-403908.out:Grid : Message : 3955.897470 s : HDCG: fPcg converged in 236 iterations 1.0
+       slurm-404273.out:Grid : Message : 3843.792191 s : HDCG: fPcg converged in 210 iterations 2.0
+       * CG smooth 9 iter: 
+      */
+      //
+      RealD MirsShift = lo;
+      ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
+      CGSmoother<LatticeFermionD> CGsmooth(ords[o],ShiftedFineHermOp) ;
+  
+      //////////////////////////////////////////
+      // Build a HDCG solver
+      //////////////////////////////////////////
+      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCG(1.0e-8, 700,
+	     FineHermOp,
+	     //	     ChebySmooth,
+	     CGsmooth,
+	     HPDSolveSloppy,
+	     HPDSolve,
+	     Aggregates);
+
+      /*
+	TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCGdefl(1.0e-8, 700,
+		 FineHermOp,
+		 Smoother,
+		 cADEF1,
+		 HPDSolve,
+		 Aggregates);
+      */
+      
+      //      result=Zero();
+      //      HDCGdefl(src,result);
+
+      result=Zero();
+      HDCG(src,result);
+      
+    }
+  }
+
+  // Standard CG
+  result=Zero();
+  CGfine(HermOpEO, src, result);
+  
+  Grid_finalize();
+  return 0;
+}
@@ -0,0 +1,618 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_general_coarse_hdcg.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+// Writes the stencil matrices held in Operator._A to a single SciDAC file.
+// Each matrix is passed through Operator.Cell.Extract() and the result is
+// written as one field record, in index order.
+// The body is compiled only when HAVE_LIME is defined; otherwise the call
+// does nothing.
+template<class Coarsened>
+void SaveOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord meta;
+  ScidacWriter writer(Operator.Grid()->IsBoss());
+  // One matrix is expected per stencil point.
+  assert(Operator._A.size()==Operator.geom.npoint);
+  writer.open(file);
+  int point=0;
+  while(point<Operator._A.size()){
+    auto field = Operator.Cell.Extract(Operator._A[point]);
+    writer.writeScidacFieldRecord(field,meta,0,0);
+    //    writer.writeScidacFieldRecord(field,meta,0,BINARYIO_LEXICOGRAPHIC);
+    point++;
+  }
+  writer.close();
+#endif
+}
+// Reads the stencil matrices of a coarsened operator from a SciDAC file.
+// One field record is read straight into each Operator._A[p] for
+// p < Operator.geom.npoint, after checking that its grid is conformable with
+// Operator.CoarseGrid(); Operator.ExchangeCoarseLinks() is called once every
+// record has been read.
+// Without HAVE_LIME nothing can be read: rather than returning with the
+// operator untouched (and letting the caller solve with it), the call logs
+// the problem and asserts.
+template<class Coarsened>
+void LoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
+    //    RD.readScidacFieldRecord(Operator._A[p],record,BINARYIO_LEXICOGRAPHIC);
+    RD.readScidacFieldRecord(Operator._A[p],record,0);
+  }
+  RD.close();
+  Operator.ExchangeCoarseLinks();
+#else
+  std::cout << GridLogMessage << "LoadOperator: built without HAVE_LIME, cannot read " << file << std::endl;
+  assert(0);
+#endif
+}
+// Alternative loader for the stencil matrices of a coarsened operator.
+// For each stencil point a temporary is obtained from
+// Operator.Cell.Extract(Operator._A[p]), one field record is read into it,
+// and Operator._A[p] is then rebuilt with Operator.Cell.ExchangePeriodic().
+// Without HAVE_LIME nothing can be read: the call logs the problem and
+// asserts instead of silently leaving the operator unchanged.
+template<class Coarsened>
+void ReLoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    // Extract() is used here only to obtain a field of the shape stored on disk.
+    auto tmp=Operator.Cell.Extract(Operator._A[p]);
+    RD.readScidacFieldRecord(tmp,record,0);
+    Operator._A[p] = Operator.Cell.ExchangePeriodic(tmp);
+  }
+  RD.close();
+#else
+  std::cout << GridLogMessage << "ReLoadOperator: built without HAVE_LIME, cannot read " << file << std::endl;
+  assert(0);
+#endif
+}
+// Writes every vector of Agg.subspace to one SciDAC file, one field record
+// per vector, in index order.
+// The body is compiled only when HAVE_LIME is defined; otherwise the call
+// does nothing.
+template<class aggregation>
+void SaveBasis(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord meta;
+  ScidacWriter writer(Agg.FineGrid->IsBoss());
+  writer.open(file);
+  int vec=0;
+  while(vec<Agg.subspace.size()){
+    //writer.writeScidacFieldRecord(Agg.subspace[vec],meta,0,BINARYIO_LEXICOGRAPHIC);
+    writer.writeScidacFieldRecord(Agg.subspace[vec],meta,0,0);
+    vec++;
+  }
+  writer.close();
+#endif
+}
+// Reads Agg.subspace.size() field records from a SciDAC file into
+// Agg.subspace, in index order.  The subspace must already have the size
+// that was written.
+// Without HAVE_LIME nothing can be read: the call logs the problem and
+// asserts instead of returning with the basis unset.
+template<class aggregation>
+void LoadBasis(aggregation &Agg, std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacReader RD ;
+  RD.open(file);
+  for(int b=0;b<Agg.subspace.size();b++){
+    //    RD.readScidacFieldRecord(Agg.subspace[b],record,BINARYIO_LEXICOGRAPHIC);
+    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
+  }
+  RD.close();
+#else
+  std::cout << GridLogMessage << "LoadBasis: built without HAVE_LIME, cannot read " << file << std::endl;
+  assert(0);
+#endif
+}
+// Saves an eigen-decomposition to disk: every entry of evec is written as a
+// field record to the SciDAC file evec_file, and eval is written under the
+// tag "evals" to the XML file eval_file.
+// The body is compiled only when HAVE_LIME is defined; otherwise the call
+// does nothing.
+template<class CoarseVector>
+void SaveEigenvectors(std::vector<RealD>            &eval,
+		      std::vector<CoarseVector>     &evec,
+		      std::string evec_file,
+		      std::string eval_file)
+{
+#ifdef HAVE_LIME
+  // evec[0] supplies the grid used to construct the writer, so an empty
+  // set of vectors cannot be saved.
+  assert(evec.size()>0);
+  emptyUserRecord record;
+  ScidacWriter WR(evec[0].Grid()->IsBoss());
+  WR.open(evec_file);
+  for(std::size_t b=0;b<evec.size();b++){
+    WR.writeScidacFieldRecord(evec[b],record,0,0);
+  }
+  WR.close();
+  XmlWriter WRx(eval_file);
+  write(WRx,"evals",eval);
+#endif
+}
+// Loads an eigen-decomposition from disk: eval is read from the tag "evals"
+// of the XML file eval_file, then one field record per eigenvalue is read
+// from the SciDAC file evec_file into evec.
+// evec must already hold as many vectors as there are stored eigenvalues
+// (checked by assert after the eigenvalues have been read).
+// Without HAVE_LIME nothing can be read: the call logs the problem and
+// asserts instead of returning with eval/evec unset.
+template<class CoarseVector>
+void LoadEigenvectors(std::vector<RealD>            &eval,
+		      std::vector<CoarseVector>     &evec,
+		      std::string evec_file,
+		      std::string eval_file)
+{
+#ifdef HAVE_LIME
+  XmlReader RDx(eval_file);
+  read(RDx,"evals",eval);
+  emptyUserRecord record;
+
+  Grid::ScidacReader RD ;
+  RD.open(evec_file);
+  assert(evec.size()==eval.size());
+  for(std::size_t k=0;k<eval.size();k++) {
+    RD.readScidacFieldRecord(evec[k],record);
+  }
+  RD.close();
+#else
+  std::cout << GridLogMessage << "LoadEigenvectors: built without HAVE_LIME, cannot read "
+	    << evec_file << " / " << eval_file << std::endl;
+  assert(0);
+#endif
+}
+
+// Returns the reciprocal 1/x of its argument.
+RealD InverseApproximation(RealD x){
+  const RealD reciprocal = 1.0/x;
+  return reciprocal;
+}
+
+// Adaptor that routes Op(), AdjOp() and HermOp() all to the wrapped
+// operator's HermOp().  Original intent, as noted by the author:
+// "Want Op in CoarsenOp to call MatPcDagMatPc".
+// The diagonal/directional interfaces and HermOpAndNorm are not provided;
+// calling any of them trips assert(0).
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & wrapped;  // operator whose HermOp() is forwarded to
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {}
+
+  // Supported entry points: every one applies wrapped.HermOp().
+  void Op(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+  void HermOp(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+  void AdjOp(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+
+  // Unsupported entry points.
+  void OpDiag(const Field &in, Field &out)
+  {
+    assert(0);
+  }
+  void OpDir(const Field &in, Field &out,int dir,int disp)
+  {
+    assert(0);
+  }
+  void OpDirAll(const Field &in, std::vector<Field> &out)
+  {
+    assert(0);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
+  {
+    assert(0);
+  }
+};
+// LinearFunction that applies a Chebyshev object, constructed from
+// (_lo,_hi,_ord,InverseApproximation), in the operator supplied at
+// construction.  The constructor logs the order and the interval.
+template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  using FineOperator = LinearOperatorBase<Field>;
+
+  FineOperator   & _SmootherOperator;  // operator the polynomial is evaluated in
+  Chebyshev<Field> Cheby;              // the polynomial itself
+
+  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator)
+    : _SmootherOperator(SmootherOperator),
+      Cheby(_lo,_hi,_ord,InverseApproximation)
+  {
+    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
+  }
+
+  // out = Cheby(_SmootherOperator) applied to a private copy of in.
+  void operator() (const Field &in, Field &out)
+  {
+    Field scratch(in.Grid());
+    scratch = in;
+    Cheby(_SmootherOperator,scratch,out);
+  }
+};
+
+// LinearFunction that runs a ConjugateGradient, constructed as
+// (0.0, iters, false), on the operator supplied at construction, starting
+// from a zero guess.  The constructor logs itself as a "Mirs smoother".
+template<class Field> class CGSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  using FineOperator = LinearOperatorBase<Field>;
+
+  FineOperator   & _SmootherOperator;  // operator handed to the CG
+  int iters;                           // iteration count handed to the CG
+
+  CGSmoother(int _iters, FineOperator &SmootherOperator)
+    : _SmootherOperator(SmootherOperator),
+      iters(_iters)
+  {
+    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
+  }
+
+  // out is zeroed, then updated by the CG with in as the source.
+  void operator() (const Field &in, Field &out)
+  {
+    // Author's note: non-converge is just fine in a smoother
+    ConjugateGradient<Field>  solver(0.0,iters,false);
+    out=Zero();
+    solver(_SmootherOperator,in,out);
+  }
+};
+
+
+// Test driver.  In order it:
+//   1. sets up fine and coarse grids, RNGs, the gauge field and a
+//      MobiusFermionD operator with its SchurDiagMooee hermitian wrapper;
+//   2. loads or builds the aggregation basis and the little Dirac operator;
+//   3. builds a multi-RHS coarse operator on a grid with one extra leading
+//      dimension for the right-hand-side index;
+//   4. obtains coarse eigenvectors (Lanczos or from file) for deflation;
+//   5. compares multi-RHS deflation and a multi-RHS coarse CG against their
+//      single-RHS counterparts;
+//   6. runs the multi-RHS two-level ADEF2 solver, then a plain fine-grid CG.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  // Hard-coded problem parameters: Ls is passed to the five-dimensional grid
+  // factories, nbasis is the number of coarse basis vectors, cb is passed to
+  // the Aggregation constructor; mass, M5, b, c are passed to MobiusFermionD.
+  const int Ls=24;
+  const int nbasis = 62;
+  //  const int nbasis = 56;
+  //  const int nbasis = 44;
+  //  const int nbasis = 36;
+  const int cb = 0 ;
+  RealD mass=0.00078;
+  RealD M5=1.8;
+  RealD b=1.5;
+  RealD c=0.5;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid: each default lattice extent is divided by the
+  // matching entry of Block (here {4,4,6,4}).  Integer division -- assumes
+  // each extent is an exact multiple of its block size; TODO confirm.
+  Coordinate Block({4,4,6,4});
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/Block[d];
+  }
+
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
+							    GridDefaultSimd(Nd,vComplex::Nsimd()),
+							    GridDefaultMpi());;
+  // The coarse five-dimensional grid is made with a fifth extent of 1.
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  ///////////////////////// RNGs /////////////////////////////////
+  // Fixed seeds; note cseeds holds the same integers as seeds5.
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  ///////////////////////// Configuration /////////////////////////////////
+  LatticeGaugeField Umu(UGrid);
+  MemoryManager::Print();
+
+  // The gauge configuration file name is hard-coded.
+  FieldMetaData header;
+  std::string file("ckpoint_lat.1000");
+  NerscIO::readConfiguration(Umu,header,file);
+  MemoryManager::Print();
+
+  //////////////////////// Fermion action //////////////////////////////////
+  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
+
+  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
+
+  // FineHermOp forwards Op/AdjOp/HermOp to HermOpEO.HermOp (see HermOpAdaptor).
+  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
+  HermFineMatrix FineHermOp(HermOpEO);
+
+  // Run power method on FineHermOp
+  //  PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src);
+ 
+  ////////////////////////////////////////////////////////////
+  ///////////// Coarse basis and Little Dirac Operator ///////
+  ////////////////////////////////////////////////////////////
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
+  
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse5d,FrbGrid,cb);
+
+  ////////////////////////////////////////////////////////////
+  // Need to check about red-black grid coarsening
+  ////////////////////////////////////////////////////////////
+  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
+
+  // NOTE(review): absolute, site-specific paths are hard-coded here; the
+  // load_* flags below select whether each file is read or regenerated.
+  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.rat.18node.62");
+  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.rat.18node.62");
+  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.rat.18node.62");
+  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
+  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
+  bool load_agg=true;
+  bool load_refine=true;
+  bool load_mat=true;
+  bool load_evec=false;
+  MemoryManager::Print();
+
+  // Initial subspace: when loading, the subspace file is read only if the
+  // refined basis is not going to be loaded on top of it further down.
+  // Otherwise the subspace is generated and saved.
+  int refine=1;
+  if ( load_agg ) {
+    if ( !(refine) || (!load_refine) ) { 
+      LoadBasis(Aggregates,subspace_file);
+    }
+  } else {
+    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
+					0.0003,1.0e-5,2000); // Lo, tol, maxit
+
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); <== last run
+    SaveBasis(Aggregates,subspace_file);
+  }
+
+  // Refined subspace: either loaded from refine_file or computed and saved.
+  if(refine){
+    if ( load_refine ) {
+      LoadBasis(Aggregates,refine_file);
+    } else {
+      // HDCG used Pcg to refine
+      Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000);
+      SaveBasis(Aggregates,refine_file);
+    }
+  }
+
+  // Little Dirac operator: loaded from ldop_file, or coarsened from
+  // FineHermOp.  A freshly coarsened operator is not written back (the
+  // SaveOperator call is commented out).
+  Aggregates.Orthogonalise();
+  if ( load_mat ) {
+    LoadOperator(LittleDiracOp,ldop_file);
+  } else {
+    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
+    //    SaveOperator(LittleDiracOp,ldop_file);
+  }
+  
+  // Coarse work vectors: random source, result and reference.
+  CoarseVector c_src(Coarse5d);   random(CRNG,c_src);
+  CoarseVector c_res(Coarse5d); 
+  CoarseVector c_ref(Coarse5d);
+
+  // Disabled check (if (0)): compares LittleDiracOp.M applied to c_src with
+  // the block projection of FineHermOp.HermOp applied to the promoted c_src.
+  if (0){
+    ///////////////////////////////////////////////////
+    // Test the operator
+    ///////////////////////////////////////////////////
+    CoarseVector c_proj(Coarse5d);
+    LatticeFermionD    tmp(FrbGrid);
+    LatticeFermionD    prom(FrbGrid);
+    
+    blockPromote(c_src,prom,Aggregates.subspace);
+
+    FineHermOp.HermOp(prom,tmp);
+
+    std::cout<<GridLogMessage<<" Calling big dirac op "<<norm2(tmp)<<std::endl;
+    blockProject(c_proj,tmp,Aggregates.subspace);
+
+    std::cout<<GridLogMessage<<" Calling little Dirac Op "<<std::endl;
+
+    LittleDiracOp.M(c_src,c_res);
+
+    std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+    std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+
+    c_proj = c_proj - c_res;
+    std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+  }
+
+  //////////////////////////////////////
+  // mrhs coarse operator
+  //  Create a higher dim coarse grid
+  //////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << "Building MultiRHS Coarse operator"<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true);
+    
+  // Number of right-hand sides: three times vComplex::Nsimd().
+  const int nrhs=vComplex::Nsimd()*3;
+    
+  // Six-dimensional layout for the multi-RHS grid: the first extent is the
+  // rhs index (nrhs), the second is 1, the remaining four are the coarse
+  // lattice.  The SIMD layout puts Nsimd in the first dimension and 1
+  // elsewhere; the MPI layout puts 1 in the first two dimensions.
+  Coordinate mpi=GridDefaultMpi();
+  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
+  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
+  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
+    
+  // NOTE(review): CoarseMrhs is allocated with new and never deleted in this function.
+  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
+  //  MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
+  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
+  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);
+  //  mrhs.CopyMatrix(LittleDiracOp);
+  //  mrhs.SetMatrix(LittleDiracOp.);
+  // The multi-RHS operator is always coarsened afresh here, independent of load_mat.
+  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);
+  //  mrhs.CheckMatrix(LittleDiracOp);
+  
+  //////////////////////////////////////////
+  // Build a coarse lanczos
+  //////////////////////////////////////////
+  std::cout << "**************************************"<<std::endl;
+  std::cout << "Building Coarse Lanczos               "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+
+  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
+  HermMatrix CoarseOp     (LittleDiracOp);
+
+  int Nk=192;
+  int Nm=256;
+  int Nstop=Nk;
+  
+  // IRL is given the Chebyshev-wrapped operator (IRLOpCheby) and the plain
+  // operator (IRLOp), both built on CoarseOp.
+  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  // 1 iter
+  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
+  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
+  
+  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1e-5,10);
+
+  int Nconv;
+  std::vector<RealD>            eval(Nm);
+  std::vector<CoarseVector>     evec(Nm,Coarse5d);
+
+  PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src);
+
+  // Eigenpairs: either loaded (after resizing both containers to Nstop) or
+  // computed by IRL and saved; the computed path requires eval to come back
+  // with exactly Nstop entries.
+  if ( load_evec ) {
+    eval.resize(Nstop);
+    evec.resize(Nstop,Coarse5d);
+    LoadEigenvectors(eval,evec,evec_file,eval_file);
+  } else { 
+    IRL.calc(eval,evec,c_src,Nconv);
+    assert(Nstop==eval.size());
+    SaveEigenvectors(eval,evec,evec_file,eval_file);
+  }
+
+  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
+
+  MultiRHSDeflation<CoarseVector> MrhsGuesser;
+  
+  //////////////////////////////////////////
+  // Build a coarse space solver
+  //////////////////////////////////////////
+  int maxit=30000;
+  ConjugateGradient<CoarseVector>  CG(1.0e-10,maxit,false);
+  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
+  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
+  
+  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
+  c_res=Zero();
+
+  /////////// MRHS test .////////////
+  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
+  MrhsHermMatrix MrhsCoarseOp     (mrhs);
+
+#if 1
+  { 
+    // Fields on the multi-RHS grid: result, initial guess and source.
+    CoarseVector rh_res(CoarseMrhs);
+    CoarseVector rh_guess(CoarseMrhs);
+    CoarseVector rh_src(CoarseMrhs);
+
+    rh_res= Zero();
+    rh_guess= Zero();
+
+    std::cout << "*************************"<<std::endl;
+    std::cout << " MrhsGuesser importing"<<std::endl;
+    std::cout << "*************************"<<std::endl;
+    MrhsGuesser.ImportEigenBasis(evec,eval);
+    std::vector<CoarseVector> BlasGuess(nrhs,Coarse5d);
+    std::vector<CoarseVector> BlasSource(nrhs,Coarse5d);
+    for(int r=0;r<nrhs;r++){
+      random(CRNG,BlasSource[r]);
+    }
+
+    // Deflate all sources at once with the multi-RHS guesser.
+    MrhsGuesser.DeflateSources(BlasSource,BlasGuess);
+
+    // Per right-hand side: recompute the guess with the single-RHS guesser,
+    // print both norms and the norm of their difference (printed only, not
+    // asserted), then insert guess and source as slice r of the multi-RHS
+    // fields.  The single-RHS guess seeds both rh_res and rh_guess.
+    for(int r=0;r<nrhs;r++){
+      std::cout << "*************************"<<std::endl;
+      std::cout << "**** DeflCoarseGuesser &&&&& "<<std::endl;
+      std::cout << "*************************"<<std::endl;
+      c_src=BlasSource[r];
+      DeflCoarseGuesser(c_src,c_res);
+      std::cout << "Deflated guess      "<< norm2(c_res)<<std::endl;
+      std::cout << "Blas deflated guess "<< norm2(BlasGuess[r])<<std::endl;
+      std::cout << "*************************"<<std::endl;
+      BlasGuess[r] = BlasGuess[r] - c_res;
+      std::cout << "Diff " <<norm2(BlasGuess[r])<<std::endl;
+      std::cout << "*************************"<<std::endl;
+      InsertSlice(c_res,rh_res,r,0);
+      InsertSlice(c_res,rh_guess,r,0);
+      InsertSlice(c_src,rh_src,r,0);
+    }
+
+    std::cout << " Calling the multiRHS coarse CG"<<std::endl;
+    coarseCG(MrhsCoarseOp,rh_src,rh_res);
+
+    //redo with block CG ?
+    // Per right-hand side: solve the same system with the single-RHS
+    // operator, starting from the saved guess, and require
+    // norm2(srhs - mrhs)/norm2(source) to be below 1.0e-1.
+    for(int r=0;r<nrhs;r++){
+      std::cout << " compare to single RHS "<<r<<"/"<<nrhs<<std::endl;
+      ExtractSlice(c_src,rh_src,r,0);
+      ExtractSlice(c_res,rh_res,r,0);
+      ExtractSlice(c_ref,rh_guess,r,0);
+      coarseCG(CoarseOp,c_src,c_ref);
+      std::cout << " mrhs [" <<r <<"] "<< norm2(c_res)<<std::endl;
+      std::cout << " srhs [" <<r <<"] "<< norm2(c_ref)<<std::endl;
+      c_ref=c_ref-c_res;
+      RealD diff =norm2(c_ref)/norm2(c_src);
+      std::cout << r << " diff " << diff<<std::endl;
+      assert(diff < 1.0e-1);
+    }
+  }
+#endif
+
+  //////////////////////////////////////
+  // fine solve
+  //////////////////////////////////////
+  
+  // Smoother scan: los supplies the smoother's low bound / shift, ords the
+  // smoother order (one value each here).
+  std::vector<RealD> los({2.0});
+  std::vector<int> ords({7}); 
+
+ /*
+ Powerlaw setup 62 vecs
+slurm-1494943.out:Grid : Message : 4874.186617 s : HDCG: Pcg converged in 171 iterations and 1706.548006 s 1.0 32
+slurm-1494943.out:Grid : Message : 6490.121648 s : HDCG: Pcg converged in 194 iterations and 1616.219654 s 1.0 16
+
+ Cheby setup: 56vecs
+ -- CG smoother O(16): 487
+ 
+Power law setup, 56 vecs -- lambda^-5
+slurm-1494383.out:Grid : Message : 4377.173265 s : HDCG: Pcg converged in 204 iterations and 1153.548935 s 1.0 32
+
+Power law setup, 56 vecs -- lambda^-3
+
+slurm-1494242.out:Grid : Message : 4370.464814 s : HDCG: Pcg converged in 204 iterations and 1143.494776 s  1.0 32
+slurm-1494242.out:Grid : Message : 5432.414820 s : HDCG: Pcg converged in 237 iterations and 1061.455882 s  1.0 16
+slurm-1494242.out:Grid : Message : 6588.727977 s : HDCG: Pcg converged in 205 iterations and 1156.565210 s  0.5 32
+
+ Power law setup, 56 vecs -- lambda^-4
+ -- CG smoother    O(16): 290
+ -- Cheby smoother O(16): 218 -- getting close to the deflation level I expect 169 from BFM paper @O(7) smoother and 64 nbasis
+
+Conclusion: higher order smoother is doing better. Much better. Use a Krylov smoother instead Mirs as in BFM version.
+ */
+				      //
+  MemoryManager::Print();
+  for(int l=0;l<los.size();l++){
+
+    RealD lo = los[l];
+
+    for(int o=0;o<ords.size();o++){
+
+      // Loose-tolerance (4.0e-2) coarse solver, deflated with the coarse eigenvectors.
+      ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false);
+      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);
+      
+      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
+      // ChebySmooth is constructed (and logs) but is not passed to any solver below.
+      ChebyshevSmoother<LatticeFermionD > ChebySmooth(lo,95,ords[o],FineHermOp);  // 311
+
+      // CG smoother acting on HermOpEO shifted by lo.
+      RealD MirsShift = lo;
+      ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
+      CGSmoother<LatticeFermionD> CGsmooth(ords[o],ShiftedFineHermOp) ;
+  
+      //////////////////////////////////////////
+      // Build a HDCG solver
+      //////////////////////////////////////////
+      // The single-RHS solver is constructed only; its invocation is commented out.
+      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCG(1.0e-8, 700,
+	     FineHermOp,
+	     CGsmooth,
+	     HPDSolveSloppy,
+	     HPDSolve,
+	     Aggregates);
+      //      result=Zero();
+      //      std::cout << "Calling HDCG single RHS"<<std::endl;
+      //      HDCG(src,result);
+
+      //////////////////////////////////////////
+      // Build a HDCG mrhs solver
+      //////////////////////////////////////////
+#if 1
+  MemoryManager::Print();
+      // Coarse solvers on the multi-RHS operator use DoNothingGuesser; the
+      // single-RHS deflation guesser is passed to HDCGmrhs separately.
+      DoNothingGuesser<CoarseVector> DoNothing;
+      HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
+      HPDSolver<CoarseVector> HPDSolveMrhsSloppy(MrhsCoarseOp,CGsloppy,DoNothing);
+      TwoLevelADEF2mrhs<LatticeFermion,CoarseVector,Subspace>
+	HDCGmrhs(1.0e-8, 500,
+		 FineHermOp,
+		 CGsmooth,
+		 //		 HPDSolveSloppy, // Never used
+		 //		 HPDSolve,       // Used in Vstart
+		 HPDSolveMrhsSloppy,    // Used in M1
+		 HPDSolveMrhs,          // Used in Vstart
+		 DeflCoarseGuesser, // single RHS guess used in M1
+		 CoarseMrhs,        // Grid needed to Mrhs grid
+		 Aggregates);
+
+      std::cout << "Calling mRHS HDCG"<<std::endl;
+      FrbGrid->Barrier();
+      
+      std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
+      std::cout << " mRHS source"<<std::endl;
+      std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
+      std::cout << " mRHS result"<<std::endl;
+
+  // Every right-hand side is a copy of the one random source src_mrhs[0];
+  // all results start from zero.
+  random(RNG5,src_mrhs[0]);
+  for(int r=0;r<nrhs;r++){
+	if(r>0)src_mrhs[r]=src_mrhs[0];
+	res_mrhs[r]=Zero();
+	std::cout << "Setup mrhs source "<<r<<std::endl;
+  }
+  std::cout << "Calling the mRHS HDCG"<<std::endl;
+  MemoryManager::Print();
+  HDCGmrhs(src_mrhs,res_mrhs);
+  MemoryManager::Print();
+#endif
+    }
+  }
+
+  // Standard CG
+  // Plain fine-grid CG on HermOpEO with a fresh random source, run after the
+  // two-level solves.
+#if 1
+  {
+    LatticeFermion result(FrbGrid); result=Zero();
+    LatticeFermion    src(FrbGrid); random(RNG5,src);
+    result=Zero();
+    CGfine(HermOpEO, src, result);
+  }
+#endif  
+  Grid_finalize();
+  return 0;
+}
@@ -0,0 +1,267 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_padded_cell.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+// Adaptor that presents the HermOp of another linear operator as an ordinary
+// operator: Op, AdjOp and HermOp all forward to the wrapped operator's HermOp.
+// OpDiag, OpDir, OpDirAll and HermOpAndNorm are not supported and assert.
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & _target; // operator whose HermOp is forwarded to
+
+  // Single forwarding point shared by Op, AdjOp and HermOp
+  void ForwardToWrappedHermOp(const Field &in, Field &out){
+    _target.HermOp(in,out);
+  }
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : _target(wrapme) {}
+
+  // Supported entry points: every one applies the wrapped HermOp
+  void Op     (const Field &in, Field &out){ ForwardToWrappedHermOp(in,out); }
+  void AdjOp  (const Field &in, Field &out){ ForwardToWrappedHermOp(in,out); }
+  void HermOp (const Field &in, Field &out){ ForwardToWrappedHermOp(in,out); }
+
+  // Unsupported entry points
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+};
+
+// Linear operator built from two matrices, Mat and PV, exposing the product
+// PV^dag Mat through the LinearOperatorBase interface:
+//   Op     : out = PV^dag Mat in
+//   AdjOp  : out = Mat^dag PV in            (adjoint of Op)
+//   HermOp : out = Mat^dag PV PV^dag Mat in (AdjOp applied after Op)
+// OpDiag, OpDir, OpDirAll and HermOpAndNorm are not implemented and assert.
+// Both matrices are held by reference and must outlive this object.
+template<class Matrix,class Field>
+class PVdagMLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+  Matrix &_PV;
+public:
+  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
+
+  void OpDiag (const Field &in, Field &out) {    assert(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
+  // out = PV^dag Mat in
+  void Op     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+  }
+  // out = Mat^dag PV in.
+  // The adjoint reverses the order of the factors: PV acts on the input first
+  // and Mat^dag writes the result. (Previously PV was applied to the
+  // uninitialised scratch field and the result of Mat^dag was discarded.)
+  void AdjOp     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _PV.M(in,tmp);
+    _Mat.Mdag(tmp,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  // out = Mat^dag PV PV^dag Mat in; tmp and out are reused alternately as
+  // scratch between the four matrix applications. Logs entry and the norm
+  // of the result on every call.
+  void HermOp(const Field &in, Field &out){
+    std::cout << "HermOp"<<std::endl;
+    Field tmp(in.Grid());
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+    _PV.M(out,tmp);
+    _Mat.Mdag(tmp,out);
+    std::cout << "HermOp done "<<norm2(out)<<std::endl;
+    
+  }
+};
+
+// Diagonal test operator: multiplies a field site by site by a fixed complex
+// lattice `scale` that is assembled from the lattice coordinates at construction.
+// Op, AdjOp and the Hermitian variants all apply the same multiplication;
+// the OpDiag / OpDir / OpDirAll hooks are empty.
+template<class Field> class DumbOperator  : public LinearOperatorBase<Field> {
+public:
+  LatticeComplex scale;
+
+  DumbOperator(GridBase *grid) : scale(grid)
+  {
+    LatticeComplex here(grid);    // coordinate value along the current dimension
+    LatticeComplex shifted(grid); // the same, Cshift-ed by one site
+    scale = 0.0;
+    // Dimensions 1..4 of the grid are folded in one after another; earlier
+    // contributions are multiplied by 100 at each step.
+    for(int dim=1;dim<=4;dim++){
+      Lattice<iScalar<vInteger> > coor(grid);
+      LatticeCoordinate(coor,dim);
+      LatticeCoordinate(here,dim);
+      shifted = Cshift(here,dim,1);
+      // Even coordinate: take the shifted value, odd coordinate: the local one
+      scale = 100.0*scale + where( mod(coor,2)==(Integer)0, shifted,here);
+    }
+    std::cout << " scale\n" << scale << std::endl;
+  }
+
+  // Support for coarsening to a multigrid (intentionally empty)
+  void OpDiag (const Field &in, Field &out) {}
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {}
+  void OpDirAll  (const Field &in, std::vector<Field> &out) {}
+
+  void Op     (const Field &in, Field &out){
+    out = scale * in;
+  }
+  void AdjOp  (const Field &in, Field &out){
+    out = scale * in;
+  }
+  // Applies the operator and returns n1 = Re<in|out>, n2 = Re<out|out>
+  void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+    out = scale * in;
+
+    ComplexD in_dot_out = innerProduct(in,out);
+    n1 = real(in_dot_out);
+
+    ComplexD out_dot_out = innerProduct(out,out);
+    n2 = real(out_dot_out);
+  }
+  // Same as HermOpAndNorm with the two norms discarded
+  void HermOp(const Field &in, Field &out){
+    double unused_n1, unused_n2;
+    HermOpAndNorm(in,out,unused_n1,unused_n2);
+  }
+};
+
+
+// Test driver: coarsens the PV^dag M operator built from two domain wall
+// operators onto a coarse grid, then compares the coarse ("little") operator
+// with the fine operator projected into the same subspace.
+// The comparison is only printed (norms of the differences); nothing is
+// asserted, so the exit status is 0 regardless of the size of the error.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=2;
+
+  // Fine grids: 4d gauge grid and the Ls-extended 5d fermion grid, each with a red-black partner
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid
+  // Every dimension of the default lattice is divided by 4 (integer division);
+  // the coarse 5d grid has extent 1 in the fifth dimension.
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  // Fixed seeds so runs are reproducible.
+  // NOTE(review): RNG4 and CRNG are seeded but not referenced again in this function.
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  // NOTE(review): result and ref are initialised but not referenced again in this function.
+  LatticeFermion    src(FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    ref(FGrid); ref=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  // The gauge configuration is read from a hard-coded file name in the working directory
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000");
+  NerscIO::readConfiguration(Umu,header,file);
+  //Umu = 1.0;
+  
+  RealD mass=0.5;
+  RealD M5=1.8;
+
+  // Two domain wall operators on the same gauge field, differing only in the
+  // mass argument: `mass` for Ddwf and 1.0 for Dpv
+  // (presumably the Pauli-Villars operator, going by the name).
+  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
+
+  const int nbasis = 1;
+  const int cb = 0 ;
+  LatticeFermion prom(FGrid);
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNearestStencilGeometry5D geom(Coarse5d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  
+  // PVdagM is built from Ddwf and Dpv; HOA re-exposes PVdagM.HermOp as Op/AdjOp/HermOp
+  PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM(Ddwf,Dpv);
+  HermOpAdaptor<LatticeFermionD> HOA(PVdagM);
+
+  // Run power method on HOA??
+  PowerMethod<LatticeFermion>       PM;   PM(HOA,src);
+ 
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace AggregatesPD(Coarse5d,FGrid,cb);
+  // The subspace is generated through HOA; the numeric arguments are setup
+  // parameters of CreateSubspaceChebyshev whose meaning is not visible here
+  // -- TODO confirm against Aggregation::CreateSubspaceChebyshev.
+  AggregatesPD.CreateSubspaceChebyshev(RNG5,
+				       HOA,
+				       nbasis,
+				       5000.0,
+				       0.02,
+				       100,
+				       50,
+				       50,
+				       0.0);
+  
+  // Coarsen PVdagM (not HOA) onto the coarse grid using the subspace built above
+  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
+  LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
+  
+  CoarseVector c_src (Coarse5d);
+  CoarseVector c_res (Coarse5d);
+  CoarseVector c_proj(Coarse5d);
+
+  // Local copy of the subspace vectors held by the aggregation
+  std::vector<LatticeFermion> subspace(nbasis,FGrid);
+  subspace=AggregatesPD.subspace;
+
+  // Check 1: promoting the all-ones coarse vector is compared with the plain
+  // sum of the subspace vectors; err holds the difference.
+  Complex one(1.0);
+  c_src = one;  // 1 in every element for vector 1.
+  blockPromote(c_src,err,subspace);
+
+  prom=Zero();
+  for(int b=0;b<nbasis;b++){
+    prom=prom+subspace[b];
+  }
+  err=err-prom; 
+  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+
+  // Check 2: fine operator applied to the promoted vector, projected back (c_proj),
+  // versus the coarse operator applied directly to c_src (c_res).
+  PVdagM.Op(prom,tmp);
+  blockProject(c_proj,tmp,subspace);
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  LittleDiracOpPV.M(c_src,c_res);
+  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
+
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
+  // c_proj is overwritten with the difference; its norm is the reported "ldop error"
+  c_proj = c_proj - c_res;
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
@@ -83,15 +83,8 @@ int main(int argc, char **argv)
  // need wrappers of the fermionic classes 
  // that have a complex construction
  // standard
-  RealD beta = 6.6 ; 
-
-#if 0
+  RealD beta = 5.6 ;
  WilsonGaugeActionR Waction(beta);
-#else
-  std::vector<Complex> boundaryG = {1,1,1,0};
-  WilsonGaugeActionR::ImplParams ParamsG(boundaryG);
-  WilsonGaugeActionR Waction(beta,ParamsG);
-#endif
  
  ActionLevel<HMCWrapper::Field> Level1(1);
  Level1.push_back(&Waction);
@@ -1,238 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./tests/Test_hmc_WilsonFermionGauge.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-
-#undef USE_OBC
-#define DO_IMPLICIT
-
-
-int main(int argc, char **argv) 
-{
-  using namespace Grid;
-
-  Grid_init(&argc, &argv);
-  GridLogLayout();
-  
-  std::string arg;
- 
-  HMCparameters HMCparams;
-#if 1
-  {
-    XmlReader  HMCrd("HMCparameters.xml");
-    read(HMCrd,"HMCparameters",HMCparams);
-  }
-#else
-//IntegratorParameters MD;
-  std::vector<int> steps(0);
-  if( GridCmdOptionExists(argv,argv+argc,"--MDsteps") ){
-    arg= GridCmdOptionPayload(argv,argv+argc,"--MDsteps");
-    GridCmdOptionIntVector(arg,steps);
-    assert(steps.size()==1);
-  }
-  MD.trajL   = 0.001*std::sqrt(2.);
-  MD.MDsteps = 1;
-  if (steps.size()>0) MD.MDsteps = steps[0];
-  if( GridCmdOptionExists(argv,argv+argc,"--trajL") ){
-    arg= GridCmdOptionPayload(argv,argv+argc,"--trajL");
-    std::vector<int> traj(0);
-    GridCmdOptionIntVector(arg,traj);
-    assert(traj.size()==1);
-    MD.trajL *= double(traj[0]);
-  }
-  MD.RMHMCTol=1e-8;
-  MD.RMHMCCGTol=1e-8;
-  std::cout << "RMHMCTol= "<<  MD.RMHMCTol<<" RMHMCCGTol= "<<MD.RMHMCCGTol<<std::endl;
-
-  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 0;
-  HMCparams.Trajectories     = 1;
-  HMCparams.NoMetropolisUntil=  100;
-  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  HMCparams.StartingType     =std::string("ColdStart");
-  HMCparams.Kappa=0.01; //checking against trivial. Pathetic.
-  HMCparams.MD = MD;
-#endif
-
-
-
-   // Typedefs to simplify notation
-#ifdef DO_IMPLICIT
-  typedef GenericHMCRunner<ImplicitMinimumNorm2> HMCWrapper;  // Uses the default minimum norm
-//  typedef GenericHMCRunner<ImplicitCampostrini> HMCWrapper;  // 4th order
-  HMCparams.MD.name    = std::string("ImplicitMinimumNorm2");
-#else
-  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;  // Uses the default minimum norm
-  HMCparams.MD.name    = std::string("MinimumNorm2");
-#endif
-
-
-
-  // Possibile to create the module by hand 
-  // hardcoding parameters or using a Reader
-
-
-  // Checkpointer definition
-  CheckpointerParameters CPparams;  
-  CPparams.config_prefix = "ckpoint_lat";
-  CPparams.rng_prefix = "ckpoint_rng";
-  CPparams.saveInterval = 1;
-  CPparams.format = "IEEE64BIG";
-  
-  HMCWrapper TheHMC(HMCparams);
-  // Grid from the command line
-  TheHMC.Resources.AddFourDimGrid("gauge");
-  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
-
-  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5";
-  RNGpar.parallel_seeds = "6 7 8 9 10";
-  TheHMC.Resources.SetRNGSeeds(RNGpar);
-
-  // Construct observables
-  // here there is too much indirection 
-  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
-  typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
-  TheHMC.Resources.AddObservable<PlaqObs>();
-  TopologyObsParameters TopParams;
-  TopParams.interval = 1;
-  TopParams.do_smearing = true;
-//  TopParams.Smearing.steps = 1600;
-//  TopParams.Smearing.step_size = 0.01;
-  TopParams.Smearing.init_step_size = 0.01;
-  TopParams.Smearing.meas_interval = 10;
-  TopParams.Smearing.maxTau = 16.0; 
-//  TheHMC.Resources.AddObservable<QObs>(TopParams);
-  //////////////////////////////////////////////
-
-  /////////////////////////////////////////////////////////////
-  // Collect actions, here use more encapsulation
-  // need wrappers of the fermionic classes 
-  // that have a complex construction
-  // standard
-
-  RealD beta = 6.6;
-  std::cout << "Wilson Gauge beta= " <<beta <<std::endl;
-#ifndef USE_OBC
-  WilsonGaugeActionR Waction(beta);
-#else
-  std::vector<Complex> boundaryG = {1,1,1,0};
-  WilsonGaugeActionR::ImplParams ParamsG(boundaryG);
-  WilsonGaugeActionR Waction(beta,ParamsG);
-  std::cout << "boundaryG = " <<boundaryG  <<std::endl;
-#endif
-
-  
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  Level1.push_back(&Waction);
-  TheHMC.TheAction.push_back(Level1);
-
-  TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
-  std::cout << "trajL= " <<TheHMC.Parameters.MD.trajL <<" steps= "<<TheHMC.Parameters.MD.MDsteps << " integrator= "<<TheHMC.Parameters.MD.name<<std::endl;
-
-  NoSmearing<HMCWrapper::ImplPolicy> S;
-#ifndef DO_IMPLICIT
-  TrivialMetric<HMCWrapper::ImplPolicy::Field> Mtr;
-#else
-// g_x3_2
-    LaplacianRatParams gpar(2),mpar(2);
-    gpar.offset = 1.;
-    gpar.a0[0] = 500.;
-    gpar.a1[0] = 0.;
-    gpar.b0[0] = 0.25;
-    gpar.b1[0] = 1.;
-    gpar.a0[1] = -500.;
-    gpar.a1[1] = 0.;
-    gpar.b0[1] = 0.36;
-    gpar.b1[1] = 1.2;
-    gpar.b2=1.;
-
-    mpar.offset = 1.;
-    mpar.a0[0] =  -0.850891906532;
-    mpar.a1[0] = -1.54707654538;
-    mpar. b0[0] = 2.85557166137;
-    mpar. b1[0] = 5.74194794773;
-    mpar.a0[1] = -13.5120056831218384729709214298;
-    mpar.a1[1] = 1.54707654538396877086370295729;
-    mpar.b0[1] = 19.2921090880640520026645390317;
-    mpar.b1[1] = -3.54194794773029020262811172870;
-    mpar.b2=1.;
-    for(int i=0;i<2;i++){
-       gpar.a1[i] *=16.;
-       gpar.b1[i] *=16.;
-       mpar.a1[i] *=16.;
-       mpar.b1[i] *=16.;
-    }
-    gpar.b2 *= 16.*16.;
-    mpar.b2 *= 16.*16.;
-
-    ConjugateGradient<LatticeGaugeField> CG(1.0e-8,10000);
-    LaplacianParams LapPar(0.0001, 1.0, 10000, 1e-8, 12, 64);
-
-    std::cout << GridLogMessage << "LaplacianRat " << std::endl;
-
-    gpar.tolerance=HMCparams.MD.RMHMCCGTol;
-    mpar.tolerance=HMCparams.MD.RMHMCCGTol;
-
-    std::cout << GridLogMessage << "gpar offset= " << gpar.offset <<std::endl;
-    std::cout << GridLogMessage << " a0= " << gpar.a0 <<std::endl;
-    std::cout << GridLogMessage << " a1= " << gpar.a1 <<std::endl;
-    std::cout << GridLogMessage << " b0= " << gpar.b0 <<std::endl;
-    std::cout << GridLogMessage << " b1= " << gpar.b1 <<std::endl;
-    std::cout << GridLogMessage << " b2= " << gpar.b2 <<std::endl ;;
-
-    std::cout << GridLogMessage << "mpar offset= " << mpar.offset <<std::endl;
-    std::cout << GridLogMessage << " a0= " << mpar.a0 <<std::endl;
-    std::cout << GridLogMessage << " a1= " << mpar.a1 <<std::endl;
-    std::cout << GridLogMessage << " b0= " << mpar.b0 <<std::endl;
-    std::cout << GridLogMessage << " b1= " << mpar.b1 <<std::endl;
-    std::cout << GridLogMessage << " b2= " << mpar.b2 <<std::endl;
-//  Assumes PeriodicGimplR or D at the moment
-    Coordinate latt  = GridDefaultLatt();
-    Coordinate mpi   = GridDefaultMpi();
-    auto UGrid = TheHMC.Resources.GetCartesian("gauge");
-    Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
-    auto UGrid_f   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
-    std::cout << GridLogMessage << " UGrid= " << UGrid <<std::endl;
-    std::cout << GridLogMessage << " UGrid_f= " << UGrid_f <<std::endl;
-
-    LaplacianAdjointRat<HMCWrapper::ImplPolicy, PeriodicGimplF> Mtr(UGrid, UGrid_f,CG, gpar, mpar);
-#endif
- 
-  {
-    XmlWriter HMCwr("HMCparameters.xml.out");
-    write(HMCwr,"HMCparameters",TheHMC.Parameters);
-  }
-
-  TheHMC.Run(S,Mtr);  // no smearing
-
-  Grid_finalize();
-
-} // main
Author	SHA1	Message	Date
Peter Boyle	070b61f08f	Simplifying the MultiRHS solver to make it do SRHS and MRHS	2024-03-06 14:04:33 -05:00
Peter Boyle	ee3b3c4c56	relocate deflation support	2024-02-27 11:52:23 -05:00
Peter Boyle	462d706a63	Move to a blas directory	2024-02-27 11:51:04 -05:00
Peter Boyle	ee0d460c8e	Blas based block project & deflate for multiRHS	2024-02-27 11:41:44 -05:00
Peter Boyle	cd15abe9d1	Mrhs prep	2024-02-27 11:41:13 -05:00
Peter Boyle	9f40467e24	Warning squash	2024-02-27 11:40:36 -05:00
Peter Boyle	d0b6593823	More verbose on checksum	2024-02-27 11:40:14 -05:00
Peter Boyle	79fc821d8d	reorg headers	2024-02-27 11:39:37 -05:00
Peter Boyle	d7fdb9a7e6	Reorg headers	2024-02-27 11:39:06 -05:00
Peter Boyle	b74de51c18	Reorder headers	2024-02-27 11:38:52 -05:00
Peter Boyle	44b466e072	Make InsertSliceFast the default at some point in future. Should I do this now?	2024-02-21 14:51:24 -05:00
Peter Boyle	5e5b471bb2	Put/Get and DEviceToDevice	2024-02-21 14:47:06 -05:00
Peter Boyle	9c2565f64e	Working and faster version	2024-02-21 14:46:43 -05:00
Peter Boyle	e1d0a7cec3	Batched blas	2024-02-21 14:38:20 -05:00
Peter Boyle	b19ae8f465	Nbasis method for convenience	2024-02-21 14:36:19 -05:00
Peter Boyle	cdff2c8e18	Updated mrhs adef	2024-02-21 14:27:19 -05:00
Peter Boyle	eb702f581b	Running on 12 rhs on 18 nodes of frontier	2024-01-22 17:44:15 -05:00
Peter Boyle	3d13fd56c5	Precompute phases, save memory in hermitian	2024-01-22 17:43:35 -05:00
Peter Boyle	6f51b49ef8	Use stderr	2024-01-22 17:41:09 -05:00
Peter Boyle	addc638856	Fast localCopyRegion, blockProjectFast	2024-01-22 17:40:38 -05:00
Peter Boyle	42ae36bc28	WOrking	2024-01-17 16:39:14 -05:00
Peter Boyle	c69f73ff9f	Working	2024-01-17 16:38:46 -05:00
Peter Boyle	ca5ae8a2e6	Revert to working.	2024-01-17 16:32:05 -05:00
Peter Boyle	d967eb53de	Working for first time	2024-01-17 16:31:12 -05:00
Peter Boyle	839f9f1bbe	Don't log memory by default	2024-01-17 16:25:50 -05:00
Peter Boyle	b754a152c6	Flag guard correctly	2024-01-17 16:25:28 -05:00
Peter Boyle	e07cb2b9de	Accelerator memory	2024-01-17 16:24:31 -05:00
Peter Boyle	a1f8bbb078	accelerator memory print	2024-01-17 16:24:09 -05:00
Peter Boyle	7909683f3b	MultiRHS	2024-01-17 16:21:07 -05:00
Peter Boyle	25f71913b7	MultiRHS coarse	2024-01-04 12:01:17 -05:00
Peter Boyle	34ddd2b7b1	MultiRHS coarse space	2024-01-04 12:00:53 -05:00
Peter Boyle	d5fd90b2f3	Add 48^3 rtest	2024-01-04 12:00:01 -05:00
Peter Boyle	b7c7000d0d	Don't need the numerical rounding tolerance in multigrid	2023-12-22 18:10:23 -05:00
Peter Boyle	551f6c4edd	Synchronise changes	2023-12-22 18:09:11 -05:00
Peter Boyle	defd814750	Speed up the coarsened matrix matrix evaluation. It is block project limited. Could be sped up with calls to Batched GEMM and a data layout change.	2023-12-22 18:07:03 -05:00
Peter Boyle	3d517bbd2a	Synchronise decouple from the launch Speeds up multileg stencils	2023-12-22 18:06:13 -05:00
Peter Boyle	78ab955fec	Better padded cell exchange	2023-12-22 18:05:41 -05:00
Peter Boyle	dd13937bb6	Better opt face gather scatter	2023-12-22 18:03:38 -05:00
Peter Boyle	66a1b63aa9	Faster grid/blas layout change. Halo exchange is now the only slow part. Revisit	2023-12-21 20:50:18 -05:00
Peter Boyle	22c611bd1a	Delete temp file	2023-12-21 18:32:31 -05:00
Peter Boyle	c9bb1bf8ea	Passing new BLAs based	2023-12-21 18:31:17 -05:00
Peter Boyle	9e489887cf	General coarse multiRHS move to BLAS implementation	2023-12-21 15:24:48 -05:00
Peter Boyle	9feb801bb9	Much simpler GPU implementation	2023-12-21 15:24:06 -05:00
Peter Boyle	c00b495933	Multigrid	2023-12-21 15:23:31 -05:00
Peter Boyle	d22eebe553	BLas options	2023-12-21 15:23:03 -05:00
Peter Boyle	8bcbd82680	BLAS based layout and implementation	2023-12-21 15:21:24 -05:00
Peter Boyle	dfa617c439	Batched SGEMM/DGEMM/ZGEMM/CGEMM Hip, Cuda version and vanilla CPU One MKL stub in comments, to be tested as different.	2023-12-21 14:01:18 -05:00
Peter Boyle	48d1f0df89	Optimised partially, working	2023-12-21 12:33:47 -05:00
Peter Boyle	b75cb7a12c	Blas batched partial implementation on Frontier only for now	2023-12-21 12:31:33 -05:00
Peter Boyle	332563e037	Debugged, reducing verbose	2023-12-21 12:30:57 -05:00
Peter Boyle	0cce97a4fe	verbosity only	2023-12-20 21:30:10 -05:00
Peter Boyle	95a8e4be64	rocblas	2023-12-20 21:27:59 -05:00
Peter Boyle	abcd6b8cb6	Faster version	2023-12-19 15:17:46 -05:00
Peter Boyle	e8f21c9b6d	Memmory verbose control improvement	2023-12-19 15:16:58 -05:00
Peter Boyle	e054078b11	Verbose	2023-12-05 16:15:17 -05:00
Peter Boyle	6835a7f208	Better logging, test on 81 point stencil	2023-11-29 19:20:47 -05:00
Peter Boyle	f59993b979	Nbasis§	2023-11-29 09:47:36 -05:00
Peter Boyle	2290b8f680	Verbose	2023-11-29 09:47:04 -05:00
Peter Boyle	2c54be651c	Further updates	2023-11-29 09:43:29 -05:00
Peter Boyle	e859a199df	Reduce volume to interior for coarse stencil -- worth up to 4x gain	2023-11-28 10:23:16 -05:00
Peter Boyle	0a3682ad0b	MultiRHS work	2023-11-28 07:43:37 -05:00
Peter Boyle	59abaeb5cd	Time stamp	2023-11-24 12:56:45 -05:00
Peter Boyle	3e448435d3	Restrict to interior	2023-11-23 18:23:29 -05:00
Peter Boyle	a294bc3c5b	Relax constraints for multiRHS	2023-11-23 18:20:42 -05:00
Peter Boyle	b302ad3d49	multiRHS test in place, passes Yay!	2023-11-23 18:20:15 -05:00
Peter Boyle	82fc4b1e94	Finalise	2023-11-23 18:19:41 -05:00
Peter Boyle	b4f1740380	Finalise message	2023-11-23 18:19:16 -05:00
Peter Boyle	031f85247c	multRHS initial support -- needs optimisation for multi project/promote. Bug fix in freeing intermediate grids to stop double free	2023-11-23 18:18:35 -05:00
Peter Boyle	639cc6f73a	better support for multiRHS coarse space Still to add restriction of domain of last loop to interior of padded cell (expect about 4.5x on test volume on Crusher)	2023-11-23 18:16:26 -05:00
Peter Boyle	09946cf1ba	Improved, works on 48^3 moving to multiRHS optimisations	2023-11-15 18:03:05 -05:00
Peter Boyle	f4fa95e7cb	Use 5.3.0	2023-11-15 18:01:38 -05:00
Peter Boyle	100e29e35e	Allow expression as argument to norm2	2023-11-15 18:00:44 -05:00
Peter Boyle	4cbe471a83	devVector	2023-11-15 18:00:07 -05:00
Peter Boyle	8bece1f861	Faster to transpose the matrix and apply with column major order	2023-11-15 17:58:38 -05:00
Peter Boyle	a3ca71ec01	Lots more setup options, still working on them	2023-11-15 17:58:04 -05:00
Peter Boyle	e0543e8af5	Implement flexible preconditioned CG	2023-11-15 17:57:39 -05:00
Peter Boyle	c1eb80d01a	Print which have converged	2023-11-15 17:57:08 -05:00
Peter Boyle	a26121d97b	Better printing	2023-11-15 17:56:45 -05:00
Peter Boyle	043031a757	Report resid on failed convergence	2023-11-15 17:56:22 -05:00
Peter Boyle	807aeebe4c	Resize tol in constructor	2023-11-15 17:55:57 -05:00
Peter Boyle	8aa1a37aad	For Mirs preconditioner solver	2023-11-15 17:55:32 -05:00
Peter Boyle	4efa042f50	C++17 change	2023-10-24 10:57:50 -04:00
Peter Boyle	c7cb37e970	c++17 accepted	2023-10-24 10:57:24 -04:00
Peter Boyle	d34b207eab	Avoid HIP warnings	2023-10-24 10:57:04 -04:00
Peter Boyle	0e6fa6f6b8	DOn't need the Cshift for the period optimisation	2023-10-24 10:56:31 -04:00
Peter Boyle	38b87de53f	This works around a stacksize limit on AMD GPU	2023-10-24 10:56:07 -04:00
Peter Boyle	aa5047a9e4	Faster blockProject blockPromote	2023-10-24 10:49:55 -04:00
Peter Boyle	24b6ee0df9	M4 file	2023-10-24 10:36:48 -04:00
Peter Boyle	1e79cc9cbe	Avoid compiler error	2023-10-24 10:36:09 -04:00
Peter Boyle	b3925df9c3	Verbose on CPU-GPU xfer, remove performance by default	2023-10-24 10:25:01 -04:00
Peter Boyle	351795ac3a	Better messaging	2023-10-20 19:33:04 -04:00
Peter Boyle	9c9c42d0df	Tests on frontier with real speed up . 3.5x on 16^3 at mq=0.01	2023-10-20 19:27:13 -04:00
Peter Boyle	b6ad1bafc7	Normal memory SendToRecvFrom asynchronous for use in general stencil code	2023-10-20 19:27:13 -04:00
Peter Boyle	a5ca40f446	Better verbose -- track CPU GPU motion under --log Memory, others go to debug output stream	2023-10-20 19:27:13 -04:00
Peter Boyle	9ab54c5565	Overlap comms & data copy/buffer assembly in Ghost zone exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	4341d96bde	Massively sped up coarse grid mult, comms Save 3ms spend (60% of time !) on cudaMalloc !!	2023-10-20 19:27:13 -04:00
Peter Boyle	5fac47a26d	Faster halo exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	e064f17346	Faster halo exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	afe10ba2a2	More digits	2023-10-20 19:27:13 -04:00
Peter Boyle	7cc3435ba8	Imporved General coarsened matrix	2023-10-20 19:27:13 -04:00
Peter Boyle	541772313c	Verbosity	2023-10-20 19:27:13 -04:00
Peter Boyle	3747494a09	Notify delet public	2023-10-20 19:27:13 -04:00
Peter Boyle	f2b98d0dcc	Const safety	2023-10-20 19:27:13 -04:00
Peter Boyle	80471bf762	Alternate implementation involving face operations	2023-10-20 19:27:13 -04:00
Peter Boyle	a06f63c110	Improved I/O and non-lexico option exposed to SciDAC format	2023-10-20 19:27:13 -04:00
Peter Boyle	0ae4478cd9	Checkpoint the subspace and ldop	2023-10-20 19:27:13 -04:00
Peter Boyle	ae4e705e09	Use random vec as easier for debug	2023-10-20 19:27:13 -04:00
Peter Boyle	f5dcea9dbf	Updates for Frontier	2023-10-20 19:27:12 -04:00
Peter Boyle	2207309f8a	Spack rules	2023-10-16 18:38:24 -04:00
Peter Boyle	2111e7ab5f	Run at physical mass	2023-10-06 21:20:21 -04:00
Peter Boyle	d29abfdcaf	Transfer code to Frontier now	2023-10-06 21:03:34 -04:00
Peter Boyle	a751c42cc5	Checkpoint restore the setup	2023-10-06 21:03:08 -04:00
Peter Boyle	6a3bc9865e	Verbose change	2023-10-06 21:02:04 -04:00
Peter Boyle	4d5f7e4377	Verbose change	2023-10-06 21:01:37 -04:00
Peter Boyle	78b117fb78	Comment fix	2023-10-06 21:01:15 -04:00
Peter Boyle	ded63a1319	Verbose change/pretty print	2023-10-06 21:00:53 -04:00
Peter Boyle	df3e4d1e9c	Return fix	2023-10-06 21:00:21 -04:00
Peter Boyle	b58fd80379	I/O for coarse op and reorganise multigrid headers	2023-10-06 13:43:46 -04:00
Peter Boyle	7f6e0f57d0	No IO in file	2023-10-06 13:39:53 -04:00
Peter Boyle	cae27678d8	gpermute	2023-10-06 13:39:19 -04:00
Peter Boyle	48ff655bad	Slightly less verbose	2023-10-06 10:47:52 -04:00
Peter Boyle	2525ad4623	Slight clean up	2023-10-06 10:47:32 -04:00
Peter Boyle	e7020017c5	Reorganise multigrid	2023-10-06 10:47:12 -04:00
Peter Boyle	eacebfad74	Reorganise multigrid into multiple headers	2023-10-06 10:46:21 -04:00
Peter Boyle	3bc2da5321	Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1	2023-10-05 16:57:59 -04:00
Peter Boyle	2d710d6bfd	Optimised parameters for 16^3	2023-10-05 16:56:55 -04:00
Peter Boyle	6532b7f32b	Eliminate older inefficient coarsening implementation	2023-10-05 16:56:15 -04:00
Peter Boyle	7b41b92d99	Only need to bad non-local dimensions	2023-10-05 16:55:48 -04:00
Peter Boyle	dd557af84b	ADEF1 and ADEF2 2 level CG	2023-10-05 16:55:19 -04:00
Peter Boyle	59b9d0e030	coalesceRead the blockSum	2023-10-05 16:54:48 -04:00
Peter Boyle	b82eee4733	Hermitian dealing with	2023-10-05 16:54:14 -04:00
Peter Boyle	6a87487544	Running on Frontier, fix RNG big volume y2k, affecting 5D RNG	2023-10-05 16:50:59 -04:00
Peter Boyle	fcf5023845	Running on Frontier	2023-10-05 16:50:59 -04:00
Peter Boyle	c8adad6d8b	First runs on Summit. PopulateAdag needs work	2023-10-05 16:50:54 -04:00
Peter Boyle	737d3ffb98	ADEF1 and 1 hop projection	2023-10-03 14:22:18 -04:00
Peter Boyle	b01e67bab1	coalescedReadGeneralPermute now working	2023-10-02 17:46:57 -04:00
Peter Boyle	8a70314f54	Merge branch 'develop' into feature/scidac-wp1	2023-10-02 17:24:55 -04:00
Peter Boyle	36ae6e5aba	Fastest GPU version. Need to work on the PaddedCell now to make much faster	2023-09-29 18:26:51 -04:00
Peter Boyle	9db585cfeb	Temporary commit while optimisation is carried out	2023-09-29 17:11:35 -04:00
Peter Boyle	c564611ba7	Annoying hack that is useful to preserve for profiling	2023-09-29 17:11:12 -04:00
Peter Boyle	e187bcb85c	Updating	2023-09-29 17:10:17 -04:00
Peter Boyle	be18ffe3b4	Further tuning and lanczos	2023-09-27 16:21:58 -04:00
Peter Boyle	0d63dce4e2	Timing info	2023-09-27 16:21:14 -04:00
Peter Boyle	26b30e1551	Flop count and projection to nearest neighbour (keeps redundant flops)	2023-09-27 16:20:11 -04:00
Peter Boyle	7fc58ac293	Verbose subspace init	2023-09-27 16:19:45 -04:00
Peter Boyle	3a86cce8c1	Compile	2023-09-27 16:19:18 -04:00
Peter Boyle	37884d369f	Coarse space is expensive, but gives a speed up in fine matrix multiplies now. Down to optimisation	2023-09-25 17:24:19 -04:00
Peter Boyle	9246e653cd	Basic non-local coarsening of operator test	2023-09-25 17:20:58 -04:00
Peter Boyle	64283c8673	Normal equations becomes linear function for easy base class pass aroudn	2023-09-25 17:19:39 -04:00
Peter Boyle	755002da9c	Comparison convenience	2023-09-25 17:16:33 -04:00
Peter Boyle	31b8e8b437	Better messaging	2023-09-25 17:16:14 -04:00
Peter Boyle	0ec0de97e6	Adef2 implemented and working in an HDCG like context	2023-09-25 17:15:03 -04:00
Peter Boyle	6c3ade5d89	Improved the coarsening	2023-09-25 17:14:40 -04:00
Peter Boyle	980c5f9a34	Update chebyshev setup	2023-09-25 17:12:22 -04:00
Peter Boyle	471ca5f281	Power method more iterations	2023-09-07 10:55:05 -04:00
Peter Boyle	e82ddcff5d	Working getting closer to HDCG but some low level engineering work still needed + MUCH work on optimisation	2023-09-07 10:53:51 -04:00
Peter Boyle	b9dcad89e8	Test cases for coarsening with non-local stencil	2023-09-07 10:53:22 -04:00
Peter Boyle	993f43ef4a	Even odd use case	2023-09-07 10:53:06 -04:00
Peter Boyle	2b43308208	First cut non-local coarsening	2023-08-25 17:38:07 -04:00
Peter Boyle	04a1ac3a76	First cut for non-local coarsening	2023-08-25 17:37:38 -04:00
Peter Boyle	990b8798bd	Merge remote-tracking branch 'refs/remotes/origin/develop' into develop	2023-08-25 17:36:45 -04:00
Peter Boyle	b334a73a44	Stencil improvement	2023-08-25 17:35:10 -04:00
Peter Boyle	5d113d1c70	Odd address sanitizer complain	2023-08-25 17:34:18 -04:00
Peter Boyle	c14977aeab	Random vector option for test purposes	2023-08-25 17:33:31 -04:00
Peter Boyle	3e94838204	Spread out improvement	2023-08-25 17:31:28 -04:00
Peter Boyle	c0a0b8ca62	NEON and address sanitiser	2023-08-25 17:30:30 -04:00