Adding simple lanczos, boundary to specflow(!)

Adding mass step
Move out src initialization for re-use / Adding antiperiodic BC
2026-06-25 13:03:30 +01:00 · 2025-08-06 23:41:53 +00:00 · 2025-08-06 16:52:51 +00:00 · 2025-08-06 16:51:14 +00:00 · 2025-07-11 15:57:23 -04:00 · 2025-04-25 10:48:41 -04:00
256 changed files with 15642 additions and 4430 deletions
@@ -0,0 +1,2 @@
+
+mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
@@ -0,0 +1,5 @@
+CXX=hipcc
+MPICXX=mpicxx 
+CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
+LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
+hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench
@@ -0,0 +1,2 @@
+
+mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
@@ -30,9 +30,14 @@ directory

 #include <type_traits>
 #include <cassert>
+#include <exception>

 #define NAMESPACE_BEGIN(A) namespace A {
 #define NAMESPACE_END(A)   }
 #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
 #define GRID_NAMESPACE_END   NAMESPACE_END(Grid)
 #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  ); 
+
+#define EXCEPTION_CHECK_BEGIN(A) try {
+#define EXCEPTION_CHECK_END(A)   } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
+
@@ -50,6 +50,7 @@ NAMESPACE_CHECK(approx);
 #include <Grid/algorithms/deflation/Deflation.h>
 #include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
 #include <Grid/algorithms/deflation/MultiRHSDeflation.h>
+#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
 NAMESPACE_CHECK(deflation);
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 NAMESPACE_CHECK(ConjGrad);
@@ -72,6 +73,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/SimpleLanczos.h>
 #include <Grid/algorithms/iterative/PowerMethod.h>
 #include <Grid/algorithms/iterative/AdefGeneric.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
@@ -168,6 +168,7 @@ public:
  template<class vobj>
  void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
 #ifndef HAVE_FFTW
+    std::cerr << "FFTW is not compiled but is called"<<std::endl;
    assert(0);
 #else
    conformable(result.Grid(),vgrid);
@@ -190,7 +191,8 @@ public:
      
    Lattice<sobj> pgbuf(&pencil_g);
    autoView(pgbuf_v , pgbuf, CpuWrite);
-
+    //std::cout << "CPU view" << std::endl;
+    
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
      
@@ -213,6 +215,7 @@ public:
    else if ( sign == forward ) div = 1.0;
    else assert(0);
      
+    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
    FFTW_plan p;
    {
      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -226,6 +229,7 @@ public:
    }
      
    // Barrel shift and collect global pencil
+    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
    Coordinate lcoor(Nd), gcoor(Nd);
    result = source;
    int pc = processor_coor[dim];
@@ -247,6 +251,7 @@ public:
      }
    }
      
+    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
    // Loop over orthog coords
    int NN=pencil_g.lSites();
    GridStopWatch timer;
@@ -269,6 +274,7 @@ public:
    usec += timer.useconds();
    flops+= flops_call*NN;
      
+    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
    // writing out result
    {
      autoView(pgbuf_v,pgbuf,CpuRead);
@@ -285,6 +291,7 @@ public:
    }
    result = result*div;
      
+    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
    // destroying plan
    FFTW<scalar>::fftw_destroy_plan(p);
 #endif
@@ -103,6 +103,38 @@ public:
    _Mat.MdagM(in,out);
  }
 };
+template<class Matrix,class Field>
+class MMdagLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+public:
+  MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
+
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    _Mat.Mdiag(in,out);
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    _Mat.Mdir(in,out,dir,disp);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
+  void Op     (const Field &in, Field &out){
+    _Mat.M(in,out);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    _Mat.Mdag(in,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    _Mat.MMdag(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+  void HermOp(const Field &in, Field &out){
+    _Mat.MMdag(in,out);
+  }
+};

 ////////////////////////////////////////////////////////////////////
 // Construct herm op and shift it for mgrid smoother
@@ -245,6 +277,38 @@ public:
    assert(0);
  }
 };
+template<class Matrix,class Field>
+class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+  RealD shift;
+public:
+  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    _Mat.Mdiag(in,out);
+    out = out + shift*in;
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    _Mat.Mdir(in,out,dir,disp);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
+  void Op     (const Field &in, Field &out){
+    _Mat.M(in,out);
+    out = out + shift * in;
+  }
+  void AdjOp     (const Field &in, Field &out){
+    _Mat.Mdag(in,out);
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    assert(0);
+  }
+  void HermOp(const Field &in, Field &out){
+    assert(0);
+  }
+};

 //////////////////////////////////////////////////////////
 // Even Odd Schur decomp operators; there are several
@@ -45,6 +45,11 @@ public:
    M(in,tmp);
    Mdag(tmp,out);
  }
+  virtual void  MMdag(const Field &in, Field &out) {
+    Field tmp (in.Grid());
+    Mdag(in,tmp);
+    M(tmp,out);
+  }
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
@@ -59,7 +59,7 @@ public:
    RealD diff = hi-lo;
    RealD delta = diff*1.0e-9;
    for (RealD x=lo; x<hi; x+=delta) {
-      delta*=1.1;
+      delta*=1.02;
      RealD f = approx(x);
      out<< x<<" "<<f<<std::endl;
    }
@@ -131,6 +131,26 @@ public:
      Coeffs[j] = s * 2.0/order;
    }
  };
+  template<class functor>
+  void Init(RealD _lo,RealD _hi,int _order, functor & func)
+  {
+    lo=_lo;
+    hi=_hi;
+    order=_order;
+      
+    if(order < 2) exit(-1);
+    Coeffs.resize(order);
+    for(int j=0;j<order;j++){
+      RealD s=0;
+      for(int k=0;k<order;k++){
+	RealD y=std::cos(M_PI*(k+0.5)/order);
+	RealD x=0.5*(y*(hi-lo)+(hi+lo));
+	RealD f=func(x);
+	s=s+f*std::cos( j*M_PI*(k+0.5)/order );
+      }
+      Coeffs[j] = s * 2.0/order;
+    }
+  };

    
  void JacksonSmooth(void){
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #ifdef GRID_ONE_MKL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
  typedef int32_t gridblasHandle_t;
@@ -89,9 +89,10 @@ public:
      gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::cpu_selector selector;
-      cl::sycl::device selectedDevice { selector };
-      gridblasHandle =new sycl::queue (selectedDevice);
+      sycl::gpu_selector selector;
+      sycl::device selectedDevice { selector };
+      sycl::property_list q_prop{sycl::property::queue::in_order()};
+      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
      gridblasInit=1;
    }
@@ -207,6 +208,9 @@ public:
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);

+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);
+
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
@@ -266,26 +270,169 @@ public:
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  ComplexD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (ComplexD *) &alpha_p[0],
+						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexD *) &beta_p[0],
+						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
+#if 0
+      // This code was used to check the mat mul on Sunspot/OneMKL
+      std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
+      std::vector<ComplexD> A(m*k);  // pointer list to matrices
+      std::vector<ComplexD> B(k*n);
+      std::vector<ComplexD> C(m*n);
+      //      int sda = lda*k;
+      //      int sdb = ldb*k;
+      //      int sdc = ldc*n;
+      std::cerr << " Checking the GEMM results "<<std::endl;
+      for (int p = 0; p < 1; ++p) {
+	ComplexD * Amk_p;  // pointer list to matrices
+	ComplexD * Bkn_p;  // pointer list to matrices
+	ComplexD * Cmn_p;  // pointer list to matrices
+	acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
+	acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
+	acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
+	std::cerr << " p " << p << " copied pointers "<<std::endl;
+	acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
+	std::cerr << " p " << p << " copied matrices "<<std::endl;
+	std::cerr << " C[0] "<<C[0]<<std::endl;
+	std::cerr << " A[0] "<<A[0]<<std::endl;
+	std::cerr << " B[0] "<<B[0]<<std::endl;
+	std::cerr << " m "<<m<<std::endl;
+	std::cerr << " n "<<n<<std::endl;
+	std::cerr << " k "<<k<<std::endl;
+	for (int mm = 0; mm < m; ++mm) {
+	  for (int nn = 0; nn < n; ++nn) {
+	    ComplexD c_mn(0.0);
+	    for (int kk = 0; kk < k; ++kk) {
+	      int idx_a, idx_b;
+	      //    int lda = m; // m x k column major
+	      //    int ldb = k; // k x n column major
+	      //    int ldc = m; // m x b column major
+	      if(OpA!=GridBLAS_OP_N) {
+		idx_a =kk + mm*lda;
+	      } else {
+		idx_a =mm + kk*lda;
+	      }
+	      if(OpB!=GridBLAS_OP_N) {
+		idx_b =nn + kk*ldb;
+	      } else {
+		idx_b =kk + nn*ldb;
+	      }
+	      //	      std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
+
+	      ComplexD Ac = A[idx_a];
+	      ComplexD Bc = B[idx_b];
+	      if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
+	      if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
+	      
+	      c_mn += Ac*Bc;
+	    }
+	    std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
+	  }
 	}
      }
-    }
 #endif
-    //    synchronise();
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
+        });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  } );
+      } else { 
+	assert(0);
+      }
+#endif
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@@ -306,6 +453,9 @@ public:
    RealD t2=usecond();
    int32_t batchCount = Amk.size();

+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);
+
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
@@ -366,26 +516,111 @@ public:
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (ComplexF *) &alpha_p[0],
+						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexF *) &beta_p[0],
+						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    ComplexF alphaf(real(alpha),imag(alpha));
-    ComplexF betaf(real(beta),imag(beta));
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  ComplexF c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  } );
+      } else { 
+	assert(0);
      }
-    }
 #endif
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
@@ -408,6 +643,9 @@ public:
    RealD t2=usecond();
    int32_t batchCount = Amk.size();

+    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    assert(OpB!=GridBLAS_OP_C);
+
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
@@ -467,24 +705,81 @@ public:
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (float *) &alpha_p[0],
+						  (const float **)&Amk[0], (const int64_t *)&lda64,
+						  (const float **)&Bkn[0], (const int64_t *)&ldb64,
+						  (float *) &beta_p[0],
+						  (float **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;	  
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  });
+      } else { 
+	assert(0);
      }
-    }
 #endif
     RealD t1=usecond();
     RealD flops = 2.0*m*n*k*batchCount;
@@ -495,7 +790,6 @@ public:
  ///////////////////////////////////////////////////////////////////////////
  // Double precision real GEMM
  ///////////////////////////////////////////////////////////////////////////
-
  void gemmBatched(GridBLASOperation_t OpA,
 		   GridBLASOperation_t OpB,
 		   int m,int n, int k,
@@ -508,6 +802,9 @@ public:
    RealD t2=usecond();
    int32_t batchCount = Amk.size();

+    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    assert(OpB!=GridBLAS_OP_C);
+
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
@@ -568,160 +865,136 @@ public:
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    /*
      int64_t m64=m;
      int64_t n64=n;
      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
      int64_t batchCount64=batchCount;
-      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
-      onemkl::transpose::N,
-      onemkl::transpose::N,
-      &m64,&n64,&k64,
-      (double *) &alpha_p[0],
-      (double **)&Amk[0], lda,
-      (double **)&Bkn[0], ldb,
-      (double *) &beta_p[0],
-      (double **)&Cmn[0], ldc,
-      1,&batchCount64);
-     */
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (double *) &alpha_p[0],
+						  (const double **)&Amk[0], (const int64_t *)&lda64,
+						  (const double **)&Bkn[0], (const int64_t *)&ldb64,
+						  (double *) &beta_p[0],
+						  (double **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  });
+      } else { 
+	assert(0);
      }
-    }
 #endif
     RealD t1=usecond();
     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
  }
-  
-
-  
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // Strided case used by benchmark, but generally unused in Grid
-  // Keep a code example in double complex, but don't generate the single and real variants for now
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  
-  void gemmStridedBatched(int m,int n, int k,
-			  ComplexD alpha,
-			  ComplexD* Amk,  // pointer list to matrices
-			  ComplexD* Bkn,
-			  ComplexD beta,
-			  ComplexD* Cmn,
-			  int batchCount)
-  {
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    int sda = m*k;
-    int sdb = k*n;
-    int sdc = m*n;
-    deviceVector<ComplexD> alpha_p(1);
-    deviceVector<ComplexD> beta_p(1);
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
-    //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
-#ifdef GRID_HIP
-    auto err = hipblasZgemmStridedBatched(gridblasHandle,
-					  HIPBLAS_OP_N,
-					  HIPBLAS_OP_N,
-					  m,n,k,
-					  (hipblasDoubleComplex *) &alpha_p[0],
-					  (hipblasDoubleComplex *) Amk, lda, sda,
-					  (hipblasDoubleComplex *) Bkn, ldb, sdb,
-					  (hipblasDoubleComplex *) &beta_p[0],
-					  (hipblasDoubleComplex *) Cmn, ldc, sdc,
-					  batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasZgemmStridedBatched(gridblasHandle,
-			      CUBLAS_OP_N,
-			      CUBLAS_OP_N,
-			      m,n,k,
-			      (cuDoubleComplex *) &alpha_p[0],
-			      (cuDoubleComplex *) Amk, lda, sda,
-			      (cuDoubleComplex *) Bkn, ldb, sdb,
-			      (cuDoubleComplex *) &beta_p[0],
-			      (cuDoubleComplex *) Cmn, ldc, sdc,
-			      batchCount);
-#endif
-#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
-    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
-						oneapi::mkl::transpose::N,
-						oneapi::mkl::transpose::N,
-						m,n,k,
-						alpha,
-						(const ComplexD *)Amk,lda,sda,
-						(const ComplexD *)Bkn,ldb,sdb,
-						beta,
-						(ComplexD *)Cmn,ldc,sdc,
-						batchCount);
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
-     // Need a default/reference implementation
-     for (int p = 0; p < batchCount; ++p) {
-       for (int mm = 0; mm < m; ++mm) {
-	 for (int nn = 0; nn < n; ++nn) {
-	   ComplexD c_mn(0.0);
-	   for (int kk = 0; kk < k; ++kk)
-	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
-	 }
-       }
-     }
-#endif
-  }

+  template<class CComplex>
  double benchmark(int M, int N, int K, int BATCH)
  {
    int32_t N_A = M*K*BATCH;
    int32_t N_B = K*N*BATCH;
    int32_t N_C = M*N*BATCH;
-    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
-    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
-    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
-    ComplexD alpha(1.0);
-    ComplexD beta (1.0);
+    deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
+    deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
+    deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
+    CComplex alpha(1.0);
+    CComplex beta (1.0);
    RealD flops = 8.0*M*N*K*BATCH;
-    int ncall=10;
+    int ncall=1000;
+    deviceVector<CComplex *> As(BATCH);
+    deviceVector<CComplex *> Bs(BATCH);
+    deviceVector<CComplex *> Cs(BATCH);
+    for(int b = 0 ; b < BATCH;b++) {
+      CComplex *ptr;
+      ptr = &A[b*M*K];      acceleratorPut(As[b],ptr);
+      ptr = &B[b*K*N];      acceleratorPut(Bs[b],ptr);
+      ptr = &C[b*M*N];      acceleratorPut(Cs[b],ptr);
+    }
+
+    // Warm up call
+    gemmBatched(M,N,K,
+		alpha,
+		As, // m x k 
+		Bs, // k x n
+		beta, 
+		Cs);
+    synchronise();
+
    RealD t0 = usecond();
    for(int i=0;i<ncall;i++){
-      gemmStridedBatched(M,N,K,
-			 alpha,
-			 &A[0], // m x k 
-			 &B[0], // k x n
-			 beta, 
-			 &C[0], // m x n
-			 BATCH);
+      gemmBatched(M,N,K,
+		  alpha,
+		  As, // m x k 
+		  Bs, // k x n
+		  beta, 
+		  Cs);
+      synchronise();
    }
-    synchronise();
    RealD t1 = usecond();
-    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
+    RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
    flops = 8.0*M*N*K*BATCH*ncall;
    flops = flops/(t1-t0)/1.e3;
    return flops; // Returns gigaflops
  }

-
-
-
 };

 NAMESPACE_END(Grid);
@@ -0,0 +1,376 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: MultiRHSBlockCGLinalg.h
+
+    Copyright (C) 2024
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+
+/* Need helper object for BLAS accelerated mrhs blockCG */
+template<class Field>
+class MultiRHSBlockCGLinalg
+{
+public:
+
+  typedef typename Field::scalar_type   scalar;
+  typedef typename Field::scalar_object scalar_object;
+  typedef typename Field::vector_object vector_object;
+
+  deviceVector<scalar> BLAS_X;      // nrhs x vol -- the sources
+  deviceVector<scalar> BLAS_Y;      // nrhs x vol -- the result
+  deviceVector<scalar> BLAS_C;      // nrhs x nrhs -- the coefficients 
+  deviceVector<scalar> BLAS_Cred;   // nrhs x nrhs x oSites -- reduction buffer
+  deviceVector<scalar *> Xdip;
+  deviceVector<scalar *> Ydip;
+  deviceVector<scalar *> Cdip;
+  
+  MultiRHSBlockCGLinalg() {};
+  ~MultiRHSBlockCGLinalg(){ Deallocate(); };
+  
+  void Deallocate(void)
+  {
+    Xdip.resize(0);
+    Ydip.resize(0);
+    Cdip.resize(0);
+    BLAS_Cred.resize(0);
+    BLAS_C.resize(0);
+    BLAS_X.resize(0);
+    BLAS_Y.resize(0);
+  }
+  void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
+  {
+    std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
+    for(int r=0;r<AP.size();r++){
+      Y_copy[r] = Y[r];
+    }
+    MulMatrix(AP,m,X);
+    for(int r=0;r<AP.size();r++){
+      AP[r] = scale*AP[r]+Y_copy[r];
+    }
+  }
+  void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
+  {
+    typedef typename Field::scalar_type scomplex;
+    GridBase *grid;
+    uint64_t vol;
+    uint64_t words;
+
+    int nrhs = Y.size();
+    grid  = X[0].Grid();
+    vol   = grid->lSites();
+    words = sizeof(scalar_object)/sizeof(scalar);
+    int64_t vw = vol * words;
+
+    RealD t0 = usecond();
+    BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
+    BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
+    BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
+    RealD t1 = usecond();
+
+    /////////////////////////////////////////////
+    // Copy in the multi-rhs sources
+    /////////////////////////////////////////////
+    for(int r=0;r<nrhs;r++){
+      int64_t offset = r*vw;
+      autoView(x_v,X[r],AcceleratorRead);
+      acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
+    }
+
+    // Assumes Eigen storage contiguous
+    acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
+    
+  /*
+   * in Fortran column major notation (cuBlas order)
+   *
+   * Xxr = [X1(x)][..][Xn(x)]
+   * Yxr = [Y1(x)][..][Ym(x)]
+   * Y = X . C
+   */
+    deviceVector<scalar *> Xd(1);
+    deviceVector<scalar *> Yd(1);
+    deviceVector<scalar *> Cd(1);
+
+    scalar * Xh = & BLAS_X[0];
+    scalar * Yh = & BLAS_Y[0];
+    scalar * Ch = & BLAS_C[0];
+
+    acceleratorPut(Xd[0],Xh);
+    acceleratorPut(Yd[0],Yh);
+    acceleratorPut(Cd[0],Ch);
+
+    RealD t2 = usecond();
+    GridBLAS BLAS;
+    /////////////////////////////////////////
+    // Y = X*C (transpose?)
+    /////////////////////////////////////////
+    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
+    		     vw,nrhs,nrhs,
+		     scalar(1.0),
+		     Xd,
+		     Cd,
+		     scalar(0.0),  // wipe out Y
+		     Yd);
+    BLAS.synchronise();
+    RealD t3 = usecond();
+
+    // Copy back Y = m X 
+    for(int r=0;r<nrhs;r++){
+      int64_t offset = r*vw;
+      autoView(y_v,Y[r],AcceleratorWrite);
+      acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
+    }    
+    RealD t4 = usecond();
+    std::cout <<GridLogPerformance << "MulMatrix alloc    took "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix blas     took "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix copy     took "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
+  }
+  
+  void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
+  {
+#if 0    
+    int nrhs;
+    GridBase *grid;
+    uint64_t vol;
+    uint64_t words;
+
+    nrhs = X.size();
+    assert(X.size()==Y.size());
+    conformable(X[0],Y[0]);
+
+    grid  = X[0].Grid();
+    vol   = grid->lSites();
+    words = sizeof(scalar_object)/sizeof(scalar);
+    int64_t vw = vol * words;
+
+    RealD t0 = usecond();
+    BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
+    BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
+    BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
+    RealD t1 = usecond();
+
+    /////////////////////////////////////////////
+    // Copy in the multi-rhs sources
+    /////////////////////////////////////////////
+    for(int r=0;r<nrhs;r++){
+      int64_t offset = r*vw;
+      autoView(x_v,X[r],AcceleratorRead);
+      acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
+      autoView(y_v,Y[r],AcceleratorRead);
+      acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
+    }
+    RealD t2 = usecond();
+
+  /*
+   * in Fortran column major notation (cuBlas order)
+   *
+   * Xxr = [X1(x)][..][Xn(x)]
+   *
+   * Yxr = [Y1(x)][..][Ym(x)]
+   *
+   * C_rs = X^dag Y
+   */
+    deviceVector<scalar *> Xd(1);
+    deviceVector<scalar *> Yd(1);
+    deviceVector<scalar *> Cd(1);
+
+    scalar * Xh = & BLAS_X[0];
+    scalar * Yh = & BLAS_Y[0];
+    scalar * Ch = & BLAS_C[0];
+
+    acceleratorPut(Xd[0],Xh);
+    acceleratorPut(Yd[0],Yh);
+    acceleratorPut(Cd[0],Ch);
+
+    GridBLAS BLAS;
+
+    RealD t3 = usecond();
+    /////////////////////////////////////////
+    // C_rs = X^dag Y
+    /////////////////////////////////////////
+    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
+    		     nrhs,nrhs,vw,
+		     ComplexD(1.0),
+		     Xd,
+		     Yd,
+		     ComplexD(0.0),  // wipe out C
+		     Cd);
+    BLAS.synchronise();
+    RealD t4 = usecond();
+
+    std::vector<scalar> HOST_C(BLAS_C.size());      // nrhs . nrhs -- the coefficients 
+    acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
+    grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
+
+    RealD t5 = usecond();
+    for(int rr=0;rr<nrhs;rr++){
+      for(int r=0;r<nrhs;r++){
+	int off = r+nrhs*rr;
+	m(r,rr)=HOST_C[off];
+      }
+    }
+    RealD t6 = usecond();
+    uint64_t M=nrhs;
+    uint64_t N=nrhs;
+    uint64_t K=vw;
+    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
+    RealD flops = 8.0*M*N*K;
+    flops = flops/(t4-t3)/1.e3;
+    bytes = bytes/(t4-t3)/1.e3;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp    t2 "<< t2-t1<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp   t6 "<< t6-t5<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
+#else
+    int nrhs;
+    GridBase *grid;
+    uint64_t vol;
+    uint64_t words;
+
+    nrhs = X.size();
+    assert(X.size()==Y.size());
+    conformable(X[0],Y[0]);
+
+    grid  = X[0].Grid();
+    int rd0 =  grid->_rdimensions[0] * grid->_rdimensions[1];
+    vol   = grid->oSites()/rd0;
+    words = rd0*sizeof(vector_object)/sizeof(scalar);
+    int64_t vw = vol * words;
+    assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
+
+    RealD t0 = usecond();
+    BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
+    BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
+    BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
+    RealD t1 = usecond();
+
+    /////////////////////////////////////////////
+    // Copy in the multi-rhs sources -- layout batched BLAS ready
+    /////////////////////////////////////////////
+    for(int r=0;r<nrhs;r++){
+      autoView(x_v,X[r],AcceleratorRead);
+      autoView(y_v,Y[r],AcceleratorRead);
+      scalar *from_x=(scalar *)&x_v[0];
+      scalar *from_y=(scalar *)&y_v[0];
+      scalar *BX = &BLAS_X[0];
+      scalar *BY = &BLAS_Y[0];
+      accelerator_for(ssw,vw,1,{
+	  uint64_t ss=ssw/words;
+	  uint64_t  w=ssw%words;
+	  uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
+	  BX[offset] = from_x[ssw];
+	  BY[offset] = from_y[ssw];
+	});
+    }
+    RealD t2 = usecond();
+
+  /*
+   * in Fortran column major notation (cuBlas order)
+   *
+   * Xxr = [X1(x)][..][Xn(x)]
+   *
+   * Yxr = [Y1(x)][..][Ym(x)]
+   *
+   * C_rs = X^dag Y
+   */
+    Xdip.resize(vol);
+    Ydip.resize(vol);
+    Cdip.resize(vol);
+    std::vector<scalar *> Xh(vol);
+    std::vector<scalar *> Yh(vol);
+    std::vector<scalar *> Ch(vol);
+    for(uint64_t ss=0;ss<vol;ss++){
+
+      Xh[ss] = & BLAS_X[ss*nrhs*words];
+      Yh[ss] = & BLAS_Y[ss*nrhs*words];
+      Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
+
+    }
+    acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
+    acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
+    acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
+    
+    GridBLAS BLAS;
+
+    RealD t3 = usecond();
+    /////////////////////////////////////////
+    // C_rs = X^dag Y
+    /////////////////////////////////////////
+    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
+    		     nrhs,nrhs,words,
+		     ComplexD(1.0),
+		     Xdip,
+		     Ydip,
+		     ComplexD(0.0),  // wipe out C
+		     Cdip);
+    BLAS.synchronise();
+    RealD t4 = usecond();
+
+    std::vector<scalar> HOST_C(BLAS_Cred.size());      // nrhs . nrhs -- the coefficients 
+    acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
+
+    RealD t5 = usecond();
+    m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
+    for(int ss=0;ss<vol;ss++){
+      Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
+      m = m + eC;
+    }
+    RealD t6l = usecond();
+    grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
+    RealD t6 = usecond();
+    uint64_t M=nrhs;
+    uint64_t N=nrhs;
+    uint64_t K=vw;
+    RealD xybytes = grid->lSites()*sizeof(scalar_object);
+    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
+    RealD flops = 8.0*M*N*K;
+    flops = flops/(t4-t3)/1.e3;
+    bytes = bytes/(t4-t3)/1.e3;
+    xybytes = 4*xybytes/(t2-t1)/1.e3;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp    t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp     t5 "<< t5-t4<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix lsum   t6l "<< t6l-t5<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum   t6 "<< t6-t6l<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
+#endif
+  }
+};
+
+NAMESPACE_END(Grid);
@@ -447,10 +447,10 @@ public:
    /////////////////////////////////////////
    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
    		     nbasis,nrhs,vw,
-		     ComplexD(1.0),
+		     scalar(1.0),
 		     Vd,
 		     Fd,
-		     ComplexD(0.0),  // wipe out C
+		     scalar(0.0),  // wipe out C
 		     Cd);
    BLAS.synchronise();
    //    std::cout << "BlockProject done"<<std::endl;
@@ -497,10 +497,10 @@ public:
    int64_t vw = block_vol * words;
    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
    		     vw,nrhs,nbasis,
-		     ComplexD(1.0),
+		     scalar(1.0),
 		     Vd,
 		     Cd,
-		     ComplexD(0.0),  // wipe out C
+		     scalar(0.0),  // wipe out C
 		     Fd);
    BLAS.synchronise();
    //    std::cout << " blas call done"<<std::endl;
@@ -182,10 +182,10 @@ public:
    /////////////////////////////////////////
    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
    		     nev,nrhs,vw,
-		     ComplexD(1.0),
+		     scalar(1.0),
 		     Ed,
 		     Rd,
-		     ComplexD(0.0),  // wipe out C
+		     scalar(0.0),  // wipe out C
 		     Cd);
    BLAS.synchronise();

@@ -210,10 +210,10 @@ public:
    /////////////////////////////////////////
    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
 		     vw,nrhs,nev,
-		     ComplexD(1.0),
+		     scalar(1.0),
 		     Ed, // x . nev
 		     Cd, // nev . nrhs
-		     ComplexD(0.0),
+		     scalar(0.0),
 		     Gd);
    BLAS.synchronise();

@@ -53,6 +53,7 @@ class TwoLevelCGmrhs
  // Fine operator, Smoother, CoarseSolver
  LinearOperatorBase<Field>   &_FineLinop;
  LinearFunction<Field>   &_Smoother;
+  MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;

  GridStopWatch ProjectTimer;
  GridStopWatch PromoteTimer;
@@ -62,7 +63,12 @@ class TwoLevelCGmrhs
  GridStopWatch SmoothTimer;
  GridStopWatch InsertTimer;

-  
+  /*
+    Field rrr;
+  Field sss;
+  Field qqq;
+  Field zzz;
+  */  
  // more most opertor functions
  TwoLevelCGmrhs(RealD tol,
 		 Integer maxit,
@@ -73,12 +79,313 @@ class TwoLevelCGmrhs
    MaxIterations(maxit),
    _FineLinop(FineLinop),
    _Smoother(Smoother)
+    /*
+    rrr(fine),
+    sss(fine),
+    qqq(fine),
+    zzz(fine)
+*/
  {
    grid       = fine;
  };
  
  // Vector case
  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
+  {
+    //    SolveSingleSystem(src,x);
+    SolvePrecBlockCG(src,x);
+  }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Thin QR factorisation (google it)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  //Dimensions
+  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
+  //
+  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
+  //
+  //   Q  C = R => Q = R C^{-1}
+  //
+  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
+  //
+  // Set C = L^{dag}, and then Q^dag Q = ident 
+  //
+  // Checks:
+  // Cdag C = Rdag R ; passes.
+  // QdagQ  = 1      ; passes
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  void ThinQRfact (Eigen::MatrixXcd &m_zz,
+		   Eigen::MatrixXcd &C,
+		   Eigen::MatrixXcd &Cinv,
+		   std::vector<Field> &  Q,
+		   std::vector<Field> & MQ,
+		   const std::vector<Field> & Z,
+		   const std::vector<Field> & MZ)
+  {
+    RealD t0=usecond();
+    _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
+    RealD t1=usecond();
+
+    m_zz = 0.5*(m_zz+m_zz.adjoint());
+    
+    Eigen::MatrixXcd L    = m_zz.llt().matrixL(); 
+    
+    C    = L.adjoint();
+    Cinv = C.inverse();
+    
+    RealD t3=usecond();
+    _BlockCGLinalg.MulMatrix( Q,Cinv,Z);
+    _BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
+    RealD t4=usecond();
+    std::cout << " ThinQRfact IP    :"<< t1-t0<<" us"<<std::endl;
+    std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
+    std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
+  }
+
+  virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
+  {
+    std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
+    src[0].Grid()->Barrier();
+    int nrhs = src.size();
+    //    std::vector<RealD> f(nrhs);
+    //    std::vector<RealD> rtzp(nrhs);
+    //    std::vector<RealD> rtz(nrhs);
+    //    std::vector<RealD> a(nrhs);
+    //    std::vector<RealD> d(nrhs);
+    //    std::vector<RealD> b(nrhs);
+    //    std::vector<RealD> rptzp(nrhs);
+
+    ////////////////////////////////////////////
+    //Initial residual computation & set up
+    ////////////////////////////////////////////
+    std::vector<RealD> ssq(nrhs);
+    for(int rhs=0;rhs<nrhs;rhs++){
+      ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
+    }      
+
+    ///////////////////////////
+    // Fields -- eliminate duplicates between fPcg and block cg
+    ///////////////////////////
+    std::vector<Field> Mtmp(nrhs,grid);
+    std::vector<Field> tmp(nrhs,grid);
+    std::vector<Field>   Z(nrhs,grid); // Rename Z to R
+    std::vector<Field>  MZ(nrhs,grid); // Rename MZ to Z
+    std::vector<Field>   Q(nrhs,grid); // 
+    std::vector<Field>  MQ(nrhs,grid); // Rename to P
+    std::vector<Field>   D(nrhs,grid);
+    std::vector<Field>  AD(nrhs,grid);
+    
+    /************************************************************************
+     * Preconditioned Block conjugate gradient rQ
+     * Generalise Sebastien Birk Thesis, after Dubrulle 2001.
+     * Introduce preconditioning following Saad Ch9
+     ************************************************************************
+     * Dimensions:
+     *
+     *   X,B etc... ==(Nferm x nrhs)
+     *  Matrix A==(Nferm x Nferm)
+     *  
+     * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+     * QC => Thin QR factorisation (google it)
+     *
+     * R = B-AX
+     * Z = Mi R
+     * QC = Z
+     * D = Q 
+     * for k: 
+     *   R  = AD
+     *   Z  = Mi R
+     *   M  = [D^dag R]^{-1}
+     *   X  = X + D M C
+     *   QS = Q - Z.M
+     *   D  = Q + D S^dag
+     *   C  = S C
+     */
+    Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(nrhs,nrhs);
+    Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(nrhs,nrhs);
+    Eigen::MatrixXcd m_zz     = Eigen::MatrixXcd::Zero(nrhs,nrhs);
+    Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(nrhs,nrhs);
+    
+    Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(nrhs,nrhs);
+    Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(nrhs,nrhs);
+    Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(nrhs,nrhs);
+    Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(nrhs,nrhs);
+    
+    Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(nrhs,nrhs);
+    Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(nrhs,nrhs);
+
+    GridStopWatch HDCGTimer;
+
+    //////////////////////////
+    // x0 = Vstart -- possibly modify guess
+    //////////////////////////
+    Vstart(X,src);
+
+    //////////////////////////
+    // R = B-AX
+    //////////////////////////
+    for(int rhs=0;rhs<nrhs;rhs++){
+      // r0 = b -A x0
+      _FineLinop.HermOp(X[rhs],tmp[rhs]);
+      axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]);    // Computes R=Z=src - A X0
+    }
+
+    //////////////////////////////////
+    // Compute MZ = M1 Z = M1 B - M1 A x0
+    //////////////////////////////////
+    PcgM1(Z,MZ);  
+
+    //////////////////////////////////
+    // QC = Z
+    //////////////////////////////////
+    ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
+
+    //////////////////////////////////
+    // D=MQ
+    //////////////////////////////////
+    for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
+
+    std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
+
+    ProjectTimer.Reset();
+    PromoteTimer.Reset();
+    DeflateTimer.Reset();
+    CoarseTimer.Reset();
+    SmoothTimer.Reset();
+    FineTimer.Reset();
+    InsertTimer.Reset();
+
+    GridStopWatch M1Timer;
+    GridStopWatch M2Timer;
+    GridStopWatch M3Timer;
+    GridStopWatch LinalgTimer;
+    GridStopWatch InnerProdTimer;
+
+    HDCGTimer.Start();
+
+    std::vector<RealD> rn(nrhs);
+    for (int k=0;k<=MaxIterations;k++){
+
+      ////////////////////
+      // Z  = AD
+      ////////////////////
+      M3Timer.Start();
+      for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);      
+      M3Timer.Stop();
+
+      ////////////////////
+      // MZ  = M1 Z <==== the Multigrid preconditioner
+      ////////////////////
+      M1Timer.Start();
+      PcgM1(Z,MZ);
+      M1Timer.Stop();
+
+      FineTimer.Start();
+      ////////////////////
+      // M  = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
+      ////////////////////
+      InnerProdTimer.Start();
+      _BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
+      InnerProdTimer.Stop();
+      m_M       = m_DZ.inverse();
+
+      ///////////////////////////
+      // X  = X + D MC
+      ///////////////////////////
+      m_tmp     = m_M * m_C;
+      LinalgTimer.Start();
+      _BlockCGLinalg.MaddMatrix(X,m_tmp, D,X);     // D are the search directions and X takes the updates 
+      LinalgTimer.Stop();
+
+      ///////////////////////////
+      // QS = Q - M Z
+      // (MQ) S = MQ - M (M1Z)
+      ///////////////////////////
+      LinalgTimer.Start();
+      _BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
+      _BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
+      ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
+      LinalgTimer.Stop();
+
+      ////////////////////////////
+      // D  = MQ + D S^dag
+      ////////////////////////////
+      m_tmp = m_S.adjoint();
+      LinalgTimer.Start();
+      _BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
+      LinalgTimer.Stop();
+
+      ////////////////////////////
+      // C  = S C
+      ////////////////////////////
+      m_C = m_S*m_C;
+      
+      ////////////////////////////
+      // convergence monitor
+      ////////////////////////////
+      m_rr = m_C.adjoint() * m_C;
+      
+      FineTimer.Stop();
+
+      RealD max_resid=0;
+      RealD rrsum=0;
+      RealD sssum=0;
+      RealD rr;
+
+      for(int b=0;b<nrhs;b++) {
+	rrsum+=real(m_rr(b,b));
+	sssum+=ssq[b];
+	rr = real(m_rr(b,b))/ssq[b];
+	if ( rr > max_resid ) max_resid = rr;
+      }
+      std::cout << GridLogMessage <<
+	  "\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
+
+
+      if ( max_resid < Tolerance*Tolerance ) { 
+
+	HDCGTimer.Stop();
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg  "<<LinalgTimer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H  "<<M3Timer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse  "<<CoarseTimer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine    "<<FineTimer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth  "<<SmoothTimer.Elapsed()<<std::endl;;
+	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert  "<<InsertTimer.Elapsed()<<std::endl;;
+
+	for(int rhs=0;rhs<nrhs;rhs++){
+
+	  _FineLinop.HermOp(X[rhs],tmp[rhs]);			  
+
+	  Field mytmp(grid);
+	  axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
+      
+	  RealD  xnorm   = sqrt(norm2(X[rhs]));
+	  RealD  srcnorm = sqrt(norm2(src[rhs]));
+	  RealD  tmpnorm = sqrt(norm2(mytmp));
+	  RealD  true_residual = tmpnorm/srcnorm;
+	  std::cout<<GridLogMessage
+		   <<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
+		   <<" solution "<<xnorm
+		   <<" source "<<srcnorm
+		   <<std::endl;
+	}
+	return;
+      }
+      
+    }
+    HDCGTimer.Stop();
+    std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
+    assert(0);
+  }
+
+  virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
  {
    std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
    src[0].Grid()->Barrier();
@@ -361,15 +668,26 @@ public:
    CoarseField PleftProjMrhs(this->coarsegridmrhs);
    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);

-    for(int rhs=0;rhs<nrhs;rhs++) {
+    //    this->rrr=in[0];

+#undef SMOOTHER_BLOCK_SOLVE
+#if SMOOTHER_BLOCK_SOLVE
+    this->SmoothTimer.Start();
+    this->_Smoother(in,Min);
+    this->SmoothTimer.Stop();
+#else
+    for(int rhs=0;rhs<nrhs;rhs++) {
      this->SmoothTimer.Start();
      this->_Smoother(in[rhs],Min[rhs]);
      this->SmoothTimer.Stop();
-
+    }
+#endif
+    //    this->sss=Min[0];
+    
+    for(int rhs=0;rhs<nrhs;rhs++) {
+      
      this->FineTimer.Start();
      this->_FineLinop.HermOp(Min[rhs],out[rhs]);
-
      axpy(tmp[rhs],-1.0,out[rhs],in[rhs]);          // resid  = in - A Min
      this->FineTimer.Stop();

@@ -401,13 +719,15 @@ public:
    this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]  
    this->PromoteTimer.Stop();
    this->FineTimer.Start();
+    //    this->qqq=tmp[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
      axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
    }
+    //    this->zzz=out[0];
    this->FineTimer.Stop();
  }
 };
-  
+

 NAMESPACE_END(Grid);

@@ -31,6 +31,58 @@ directory

 NAMESPACE_BEGIN(Grid);

+template<class Field>
+void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
+  typedef typename Field::scalar_type scomplex;
+  int Nblock = X.size();
+  for(int b=0;b<Nblock;b++){
+  for(int bp=0;bp<Nblock;bp++) {
+    m(b,bp) = innerProduct(X[b],Y[bp]);  
+  }}
+}
+template<class Field>
+void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
+  // Should make this cache friendly with site outermost, parallel_for
+  // Deal with case AP aliases with either Y or X
+  //
+  //Could pack "X" and "AP" into a Nblock x Volume dense array.
+  // AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
+  typedef typename Field::scalar_type scomplex;
+  int Nblock = AP.size();
+  std::vector<Field> tmp(Nblock,X[0]);
+  for(int b=0;b<Nblock;b++){
+    tmp[b]   = Y[b];
+    for(int bp=0;bp<Nblock;bp++) {
+      tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp]; 
+    }
+  }
+  for(int b=0;b<Nblock;b++){
+    AP[b] = tmp[b];
+  }
+}
+template<class Field>
+void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
+  // Should make this cache friendly with site outermost, parallel_for
+  typedef typename Field::scalar_type scomplex;
+  int Nblock = AP.size();
+  for(int b=0;b<Nblock;b++){
+    AP[b] = Zero();
+    for(int bp=0;bp<Nblock;bp++) {
+      AP[b] += scomplex(m(bp,b))*X[bp]; 
+    }
+  }
+}
+template<class Field>
+double normv(const std::vector<Field> &P){
+  int Nblock = P.size();
+  double nn = 0.0;
+  for(int b=0;b<Nblock;b++) {
+    nn+=norm2(P[b]);
+  }
+  return nn;
+}
+
+
 enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };

 //////////////////////////////////////////////////////////////////////////
@@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  sliceInnerProductMatrix(m_rr,R,R,Orthog);

  // Force manifest hermitian to avoid rounding related
+  /*
+  int rank=m_rr.rows();
+  for(int r=0;r<rank;r++){
+  for(int s=0;s<rank;s++){
+    std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
+  }}
+  */
  m_rr = 0.5*(m_rr+m_rr.adjoint());

  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 

+//  ComplexD det = L.determinant();
+//  std::cout << " Det m_rr "<<det<<std::endl;
  C    = L.adjoint();
  Cinv = C.inverse();
  ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
 		 const std::vector<Field> & R)
 {
  InnerProductMatrix(m_rr,R,R);
-
+  /*
+  int rank=m_rr.rows();
+  for(int r=0;r<rank;r++){
+  for(int s=0;s<rank;s++){
+    std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
+  }}
+  */
  m_rr = 0.5*(m_rr+m_rr.adjoint());

  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 

+  //  ComplexD det = L.determinant();
+  //  std::cout << " Det m_rr "<<det<<std::endl;
+
  C    = L.adjoint();
  Cinv = C.inverse();

@@ -186,6 +256,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  sliceNorm(ssq,B,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+  for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;

  sliceNorm(residuals,B,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  Linop.HermOp(X, AD);
  tmp = B - AD;  

+  sliceNorm(residuals,tmp,Orthog);
+  for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
+  
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  D=Q;

@@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  GridStopWatch SolverTimer;
  SolverTimer.Start();

+  RealD max_resid=0;
+
  int k;
  for (k = 1; k <= MaxIterations; k++) {

@@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
     */
    m_rr = m_C.adjoint() * m_C;

-    RealD max_resid=0;
+    max_resid=0;
    RealD rrsum=0;
    RealD rr;

@@ -322,7 +398,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    }

  }
-  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
+	    <<" residual "<< std::sqrt(max_resid)<< std::endl;

  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
@@ -466,43 +544,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
  IterationsToComplete = k;
 }

-void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
-  for(int b=0;b<Nblock;b++){
-  for(int bp=0;bp<Nblock;bp++) {
-    m(b,bp) = innerProduct(X[b],Y[bp]);  
-  }}
-}
-void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
-  // Should make this cache friendly with site outermost, parallel_for
-  // Deal with case AP aliases with either Y or X
-  std::vector<Field> tmp(Nblock,X[0]);
-  for(int b=0;b<Nblock;b++){
-    tmp[b]   = Y[b];
-    for(int bp=0;bp<Nblock;bp++) {
-      tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp]; 
-    }
-  }
-  for(int b=0;b<Nblock;b++){
-    AP[b] = tmp[b];
-  }
-}
-void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
-  // Should make this cache friendly with site outermost, parallel_for
-  for(int b=0;b<Nblock;b++){
-    AP[b] = Zero();
-    for(int bp=0;bp<Nblock;bp++) {
-      AP[b] += scomplex(m(bp,b))*X[bp]; 
-    }
-  }
-}
-double normv(const std::vector<Field> &P){
-  double nn = 0.0;
-  for(int b=0;b<Nblock;b++) {
-    nn+=norm2(P[b]);
-  }
-  return nn;
-}
-
 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQvec implementation:
 //--------------------------
@@ -549,6 +590,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field

  RealD sssum=0;
  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];

  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
  for(int b=0;b<Nblock;b++) {
    Linop.HermOp(X[b], AD[b]);
    tmp[b] = B[b] - AD[b];  
+    std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
  }

  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
@@ -38,12 +38,13 @@ NAMESPACE_BEGIN(Grid);
 // single input vec, single output vec.
 /////////////////////////////////////////////////////////////

+
 template <class Field>
 class ConjugateGradient : public OperatorFunction<Field> {
 public:

  using OperatorFunction<Field>::operator();
-
+  
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
@@ -54,11 +55,26 @@ public:
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Tolerance(tol),
      MaxIterations(maxit),
-      ErrorOnNoConverge(err_on_no_conv){};
+      ErrorOnNoConverge(err_on_no_conv)
+  {};

-  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
+  virtual void LogIteration(int k,RealD a,RealD b){
+    //    std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
+  };
+  virtual void LogBegin(void){
+    std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
+  };

-    GRID_TRACE("ConjugateGradient");
+    void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
+
+      this->LogBegin();
+
+      GRID_TRACE("ConjugateGradient");
+    GridStopWatch PreambleTimer;
+    GridStopWatch ConstructTimer;
+    GridStopWatch NormTimer;
+    GridStopWatch AssignTimer;
+    PreambleTimer.Start();
    psi.Checkerboard() = src.Checkerboard();

    conformable(psi, src);
@@ -66,22 +82,32 @@ public:
    RealD cp, c, a, d, b, ssq, qq;
    //RealD b_pred;

-    Field p(src);
-    Field mmp(src);
-    Field r(src);
+    // Was doing copies
+    ConstructTimer.Start();
+    Field p  (src.Grid());
+    Field mmp(src.Grid());
+    Field r  (src.Grid());
+    ConstructTimer.Stop();

    // Initial residual computation & set up
-    RealD guess = norm2(psi);
-    assert(std::isnan(guess) == 0);
-    
-    Linop.HermOpAndNorm(psi, mmp, d, b);
-    
-    r = src - mmp;
-    p = r;
-
-    a = norm2(p);
-    cp = a;
+    NormTimer.Start();
    ssq = norm2(src);
+    RealD guess = norm2(psi);
+    NormTimer.Stop();
+    assert(std::isnan(guess) == 0);
+    AssignTimer.Start();
+    if ( guess == 0.0 ) {
+      r = src;
+      p = r;
+      a = ssq;
+    } else { 
+      Linop.HermOpAndNorm(psi, mmp, d, b);
+      r = src - mmp;
+      p = r;
+      a = norm2(p);
+    }
+    cp = a;
+    AssignTimer.Stop();

    // Handle trivial case of zero src
    if (ssq == 0.){
@@ -103,7 +129,7 @@ public:
    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a/ssq);
-      std::cout << GridLogMessage << "ConjugateGradient guess is converged already : cp " << cp <<" rsq "<<rsq <<" ssq "<<ssq<< std::endl;
+      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
      IterationsToComplete = 0;	
      return;
    }
@@ -111,6 +137,7 @@ public:
    std::cout << GridLogIterative << std::setprecision(8)
              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;

+    PreambleTimer.Stop();
    GridStopWatch LinalgTimer;
    GridStopWatch InnerTimer;
    GridStopWatch AxpyNormTimer;
@@ -156,6 +183,7 @@ public:
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
+      LogIteration(k,a,b);

      IterationTimer.Stop();
      if ( (k % 500) == 0 ) {
@@ -183,7 +211,8 @@ public:
 		  << "\tTrue residual " << true_residual
 		  << "\tTarget " << Tolerance << std::endl;

-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	//	std::cout << GridLogMessage << "\tPreamble   " << PreambleTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tSolver Elapsed    " << SolverTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
 	std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogPerformance << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
@@ -202,18 +231,143 @@ public:
      }
    }
    // Failed. Calculate true residual before giving up                                                         
-    Linop.HermOpAndNorm(psi, mmp, d, qq);
-    p = mmp - src;
-
-    TrueResidual = sqrt(norm2(p)/ssq);
+    // Linop.HermOpAndNorm(psi, mmp, d, qq);
+    //    p = mmp - src;
+    //TrueResidual = sqrt(norm2(p)/ssq);
+    //    TrueResidual = 1;

    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
-	      <<" residual "<< TrueResidual<< std::endl;
+    	      <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
+    SolverTimer.Stop();
+    std::cout << GridLogMessage << "\tPreamble   " << PreambleTimer.Elapsed() <<std::endl;
+    std::cout << GridLogMessage << "\tConstruct  " << ConstructTimer.Elapsed() <<std::endl;
+    std::cout << GridLogMessage << "\tNorm       " << NormTimer.Elapsed() <<std::endl;
+    std::cout << GridLogMessage << "\tAssign     " << AssignTimer.Elapsed() <<std::endl;
+    std::cout << GridLogMessage << "\tSolver     " << SolverTimer.Elapsed() <<std::endl;
+    std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
+    std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+    std::cout << GridLogMessage<< "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\t\tInner      " << InnerTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\t\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;

  }
 };
+
+
+template <class Field>
+class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
+public:
+  // Optionally record the CG polynomial
+  std::vector<double> ak;
+  std::vector<double> bk;
+  std::vector<double> poly_p;
+  std::vector<double> poly_r;
+  std::vector<double> poly_Ap;
+  std::vector<double> polynomial;
+
+public:
+  ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
+    : ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
+  { };
+  void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
+  {
+    Field tmp(src.Grid());
+    Field AtoN(src.Grid());
+    AtoN = src;
+    psi=AtoN*polynomial[0];
+    for(int n=1;n<polynomial.size();n++){
+      tmp = AtoN;
+      Linop.HermOp(tmp,AtoN);
+      psi = psi + polynomial[n]*AtoN;
+    }
+  }
+  void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
+  {
+    Field Ap(src.Grid());
+    Field r(src.Grid());
+    Field p(src.Grid());
+    p=src;
+    r=src;
+    x=Zero();
+    x.Checkerboard()=src.Checkerboard();
+    for(int k=0;k<ak.size();k++){
+      x = x + ak[k]*p;
+      Linop.HermOp(p,Ap);
+      r = r - ak[k] * Ap;
+      p = r + bk[k] * p;
+    }
+  }
+  void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
+  {
+    psi=Zero();
+    this->operator ()(Linop,src,psi);
+  }
+  virtual void LogBegin(void)
+  {
+    std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
+    ak.resize(0);
+    bk.resize(0);
+    polynomial.resize(0);
+    poly_Ap.resize(0);
+    poly_Ap.resize(0);
+    poly_p.resize(1);
+    poly_r.resize(1);
+    poly_p[0]=1.0;
+    poly_r[0]=1.0;
+  };
+  virtual void LogIteration(int k,RealD a,RealD b)
+  {
+    // With zero guess,
+    // p = r = src
+    //
+    // iterate:
+    //   x =  x + a p
+    //   r =  r - a A p
+    //   p =  r + b p
+    //
+    // [0]
+    // r = x
+    // p = x
+    // Ap=0
+    //
+    // [1]
+    // Ap = A x + 0  ==> shift poly P right by 1 and add 0.
+    // x  = x + a p  ==> add polynomials term by term 
+    // r  = r - a A p  ==> add polynomials term by term
+    // p  = r + b p  ==> add polynomials term by term
+    //
+    std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
+    ak.push_back(a);
+    bk.push_back(b);
+    //  Ap= right_shift(p)
+    poly_Ap.resize(k+1);
+    poly_Ap[0]=0.0;
+    for(int i=0;i<k;i++){
+      poly_Ap[i+1]=poly_p[i];
+    }
+
+    //  x = x + a p
+    polynomial.resize(k);
+    polynomial[k-1]=0.0;
+    for(int i=0;i<k;i++){
+      polynomial[i] = polynomial[i] + a * poly_p[i];
+    }
+    
+    //  r = r - a Ap
+    //  p = r + b p
+    poly_r.resize(k+1);
+    poly_p.resize(k+1);
+    poly_r[k] = poly_p[k] = 0.0;
+    for(int i=0;i<k+1;i++){
+      poly_r[i] = poly_r[i] - a * poly_Ap[i];
+      poly_p[i] = poly_r[i] + b * poly_p[i];
+    }
+  }
+};
+
 NAMESPACE_END(Grid);
 #endif
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
-      
+      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

      if(norm < OuterLoopNormMult * stop){
 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 	break;
      }
-      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
@@ -102,11 +102,11 @@ public:
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
  
-    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
-    RealD  rsq[nshift];
-    RealD  z[nshift][2];
-    int     converged[nshift];
+    // remove dynamic sized arrays on stack; 2d is a pain with vector
+    std::vector<RealD>  bs(nshift);
+    std::vector<RealD>  rsq(nshift);
+    std::vector<std::array<RealD,2> >  z(nshift);
+    std::vector<int>     converged(nshift);
  
    const int       primary =0;
  
@@ -123,11 +123,11 @@ public:
    assert(mresidual.size()==nshift);
  
    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
-    RealD  rsq[nshift];
-    RealD  rsqf[nshift];
-    RealD  z[nshift][2];
-    int     converged[nshift];
+    std::vector<RealD>  bs(nshift);
+    std::vector<RealD>  rsq(nshift);
+    std::vector<RealD>  rsqf(nshift);
+    std::vector<std::array<RealD,2> >  z(nshift);
+    std::vector<int>     converged(nshift);
  
    const int       primary =0;
  
@@ -156,11 +156,11 @@ public:
    assert(mresidual.size()==nshift);
  
    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
-    RealD  rsq[nshift];
-    RealD  rsqf[nshift];
-    RealD  z[nshift][2];
-    int     converged[nshift];
+    std::vector<RealD>  bs(nshift);
+    std::vector<RealD>  rsq(nshift);
+    std::vector<RealD>  rsqf(nshift);
+    std::vector<std::array<RealD,2> >  z(nshift);
+    std::vector<int>     converged(nshift);
  
    const int       primary =0;
  
@@ -143,7 +143,7 @@ public:
      ip = innerProduct(evec[j],w); 
      if(if_print) 
      if( norm(ip)/norm2(w) > 1e-14)
-      Glog<<"orthogonalize before: "<<j<<" of "<<k<<" "<< ip <<std::endl;
+	Glog<<"orthogonalize before: "<<j<<" of "<<k<<" "<< ip <<std::endl;
      w = w - ip * evec[j];
      if(if_print) {
        ip = innerProduct(evec[j],w); 
@@ -281,14 +281,14 @@ public:
      _sort.push(eval2,Nm);
      //      Glog << "#Ritz value before shift: "<< std::endl;
      for(int i=0; i<Nm; ++i){
-	//        std::cout.precision(13);
-	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+	//	std::cout.precision(13);
+	//	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	//	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
      }
      
      //----------------------------------------------------------------------
      if ( Nm>Nk ) {
-        Glog <<" #Apply shifted QR transformations "<<std::endl;
+	//        Glog <<" #Apply shifted QR transformations "<<std::endl;
        //int k2 = Nk+Nu;
        int k2 = Nk;
      
@@ -297,7 +297,8 @@ public:
        
        unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);

-        for(int ip=Nk; ip<Nm; ++ip){ 
+        for(int ip=Nk; ip<Nm; ++ip){
+	  Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
          shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
        }
        
@@ -325,11 +326,11 @@ public:
        Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
        diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
        _sort.push(eval2,Nk);
-	//        Glog << "#Ritz value after shift: "<< std::endl;
+	//	Glog << "#Ritz value after shift: "<< std::endl;
        for(int i=0; i<Nk; ++i){
-	  //          std::cout.precision(13);
-	  //          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	  //          std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+	  //	  std::cout.precision(13);
+	  //	  std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	  //	  std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
        }
      }
      //----------------------------------------------------------------------
@@ -467,10 +468,10 @@ public:
  
    // set initial vector
    for (int i=0; i<Nu; ++i) {
-      //      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
+      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
      evec[i] = src[i];
      orthogonalize(evec[i],evec,i);
-      //      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
+      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
    }
 //    exit(-43);
    
@@ -506,11 +507,11 @@ public:
      Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
      diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
      _sort.push(eval2,Nr);
-      //      Glog << "#Ritz value: "<< std::endl;
+      Glog << "#Ritz value: "<< std::endl;
      for(int i=0; i<Nr; ++i){
-	//        std::cout.precision(13);
-	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+	std::cout.precision(13);
+	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
      }
      
      // Convergence test
@@ -570,6 +571,7 @@ public:
      Glog << fname + " NOT converged ; Summary :\n";
    } else {
      Glog << fname + " CONVERGED ; Summary :\n";
+      Nstop = Nconv_guess; // Just take them all
      // Sort convered eigenpairs.
      std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating

@@ -779,7 +781,7 @@ private:
    
    for ( int u=0; u<Nu; ++u ) {
      for (int k=0; k<Nk; ++k ) {
-//        Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
+	//	Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
        BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
      }
    }
@@ -933,7 +935,7 @@ if (1){
         int Nu, int Nb, int Nk, int Nm,
         Eigen::MatrixXcd& M)
  {
-    //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
+    //    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
    assert( Nk%Nu == 0 && Nm%Nu == 0 );
    assert( Nk <= Nm );
    M = Eigen::MatrixXcd::Zero(Nk,Nk);
@@ -951,7 +953,7 @@ if (1){
        M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
      }
    }
-    //Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl; 
+    //    Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; 
  }
 

@@ -961,7 +963,7 @@ if (1){
         int Nu, int Nb, int Nk, int Nm,
         Eigen::MatrixXcd& M)
  {
-    //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
+    //    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
    assert( Nk%Nu == 0 && Nm%Nu == 0 );
    assert( Nk <= Nm );
    
@@ -977,7 +979,7 @@ if (1){
        lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
      }
    }
-    //Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl; 
+    //    Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; 
  }


@@ -986,7 +988,7 @@ if (1){
 		            RealD Dsh,
 		            Eigen::MatrixXcd& Qprod)
  {
-    //Glog << "shiftedQRDecompEigen() begin" << '\n'; 
+    //    Glog << "shiftedQRDecompEigen() begin" << '\n'; 
    Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
    Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
    Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1002,6 +1004,7 @@ if (1){
                        // lower triangular part used to represent series
                        // of Q sequence.

+    //    Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n'; 
    // equivalent operation of Qprod *= Q
    //M = Eigen::MatrixXcd::Zero(Nm,Nm);
    
@@ -1022,6 +1025,7 @@ if (1){
    
    Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);

+    //    Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; 
    for (int i=0; i<Nm; ++i) {
      for (int j=0; j<Nm-(Nu+1); ++j) {
        for (int k=0; k<Nu+1+j; ++k) {
@@ -1029,6 +1033,7 @@ if (1){
        }
      }
    }
+    //    Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; 
    for (int i=0; i<Nm; ++i) {
      for (int j=Nm-(Nu+1); j<Nm; ++j) {
        for (int k=0; k<Nm; ++k) {
@@ -1036,6 +1041,7 @@ if (1){
        }
      }
    }
+    //    Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; 
    
    //static int ntimes = 2;
    //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@@ -1061,11 +1067,13 @@ if (1){
        Mtmp(j,i) = conj(Mtmp(i,j));
      }
    }
+    //    Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; 

    for (int i=0; i<Nm; ++i) {
      Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
    }
    
+    //    Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; 
    M = Mtmp;

    //M = Q.adjoint()*(M*Q);
@@ -1077,7 +1085,7 @@ if (1){
    //  }
    //}
    
-    //Glog << "shiftedQRDecompEigen() end" << endl; 
+    //    Glog << "shiftedQRDecompEigen() end" <<std::endl; 
  }

  void exampleQRDecompEigen(void)
@@ -245,9 +245,10 @@ until convergence
 	_HermOp(src_n,tmp);
 	//	std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
 	//	std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
-	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+//	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+	RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
 	RealD vden = norm2(src_n);
-	RealD na = vnum/vden;
+	RealD na = std::sqrt(vnum/vden);
 	if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
@@ -255,6 +256,7 @@ until convergence
 	src_n = tmp;
      }
    }
+    std::cout << GridLogIRL << " Final evalMaxApprox  " << evalMaxApprox << std::endl;
 	
    std::vector<RealD> lme(Nm);  
    std::vector<RealD> lme2(Nm);
@@ -60,6 +60,32 @@ public:
  }     
 };

+template<class Field> class NormalResidual : public LinearFunction<Field>{
+private:
+  SparseMatrixBase<Field> & _Matrix;
+  OperatorFunction<Field> & _HermitianSolver;
+  LinearFunction<Field>   & _Guess;
+public:
+
+  /////////////////////////////////////////////////////
+  // Wrap the usual normal equations trick
+  /////////////////////////////////////////////////////
+ NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
+		 LinearFunction<Field> &Guess) 
+   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
+
+  void operator() (const Field &in, Field &out){
+ 
+    Field res(in.Grid());
+    Field tmp(in.Grid());
+
+    MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
+    _Guess(in,res);
+    _HermitianSolver(MMdagOp,in,res);  // M Mdag res = in ;
+    _Matrix.Mdag(res,out);             // out = Mdag res
+  }     
+};
+
 template<class Field> class HPDSolver : public LinearFunction<Field> {
 private:
  LinearOperatorBase<Field> & _Matrix;
@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
    RealD evalMaxApprox = 0.0; 
    auto src_n = src; 
    auto tmp = src; 
-    const int _MAX_ITER_EST_ = 100; 
+    const int _MAX_ITER_EST_ = 200; 

    for (int i=0;i<_MAX_ITER_EST_;i++) { 
      
@@ -30,18 +30,17 @@ template<class Field> class PowerMethod
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 

-      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
+      std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      
-      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
- 	evalMaxApprox = na; 
-	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
- 	return evalMaxApprox; 
-      } 
+      //      if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) { 
+	// 	evalMaxApprox = na; 
+	// 	return evalMaxApprox; 
+      //      } 
      evalMaxApprox = na; 
      src_n = tmp;
    }
-    assert(0);
-    return 0;
+    std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
+    return evalMaxApprox;
  }
 };
 }
@@ -0,0 +1,76 @@
+#pragma once
+namespace Grid {
+
+class Band
+{
+  RealD lo, hi;
+public:
+  Band(RealD _lo,RealD _hi)
+  {
+    lo=_lo;
+    hi=_hi;
+  }
+  RealD operator() (RealD x){
+    if ( x>lo && x<hi ){
+      return 1.0;
+    } else {
+      return 0.0;
+    }
+  }
+};
+
+class PowerSpectrum
+{ 
+ public: 
+
+  template<typename T>  static RealD normalise(T& v) 
+  {
+    RealD nn = norm2(v);
+    nn = sqrt(nn);
+    v = v * (1.0/nn);
+    return nn;
+  }
+
+  std::vector<RealD> ranges;
+  std::vector<int> order;
+  
+  PowerSpectrum(  std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order)  { };
+
+  template<class Field>
+  RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src) 
+  { 
+    GridBase *grid = src.Grid(); 
+    int N=ranges.size();
+    RealD hi = ranges[N-1];
+
+    RealD lo_band = 0.0;
+    RealD hi_band;
+    RealD nn=norm2(src);
+    RealD ss=0.0;
+
+    Field tmp = src;
+
+    for(int b=0;b<N;b++){
+      hi_band = ranges[b];
+      Band Notch(lo_band,hi_band);
+      
+      Chebyshev<Field> polynomial;
+      polynomial.Init(0.0,hi,order[b],Notch);
+      polynomial.JacksonSmooth();
+
+      polynomial(HermOp,src,tmp) ;
+
+      RealD p=norm2(tmp);
+      ss=ss+p;
+      std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
+      
+      lo_band=hi_band;
+    }
+    std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
+    std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
+
+    return 0;
+  };
+};
+  
+}
@@ -74,7 +74,7 @@ public:

  void operator() (const Field &src, Field &psi){

-    psi=Zero();
+    //    psi=Zero();
    RealD cp, ssq,rsq;
    ssq=norm2(src);
    rsq=Tolerance*Tolerance*ssq;
@@ -499,6 +499,87 @@ namespace Grid {
      }
  };

+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, left preconditioned by Mee^inv
+  // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv  ) phi =  Mee_inv eta
+  //
+  // Solve:
+  // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta
+  //
+  // Old notation e<->o
+  //
+  // Left precon by Moo^-1
+  //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o
+  //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /////////////////////////////////////////////////////
+    // Wrap the usual normal equations Schur trick
+    /////////////////////////////////////////////////////
+  SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+      const bool _solnAsInitGuess = false)  
+    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
+
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+    
+      /////////////////////////////////////////////////////
+      // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      Mtmp=src_o-Mtmp;                 
+      _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);     
+      
+      // get the right MpcDag
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+    }
+
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even);
+      tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd );
+    };
+
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal is identity, right preconditioned by Mee^inv
  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta
@@ -0,0 +1,931 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+
+    Copyright (C) 2015
+
+Author: Chulwoo Jung <chulwoo@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LANC_H
+#define GRID_LANC_H
+
+#include <string.h>		//memset
+
+#ifdef USE_LAPACK
+#ifdef USE_MKL
+#include<mkl_lapack.h>
+#else
+void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
+		    double *vl, double *vu, int *il, int *iu, double *abstol,
+		    int *m, double *w, double *z, int *ldz, int *isuppz,
+		    double *work, int *lwork, int *iwork, int *liwork,
+		    int *info);
+//#include <lapacke/lapacke.h>
+#endif
+#endif
+
+//#include <Grid/algorithms/densematrix/DenseMatrix.h>
+
+// eliminate temorary vector in calc()
+#define MEM_SAVE
+
+namespace Grid
+{
+
+  struct Bisection
+  {
+
+#if 0
+    static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
+			  std::vector < RealD > &BETA,
+			  std::vector < RealD > &eig)
+    {
+      int i, j;
+        std::vector < RealD > evec1 (row_num + 3);
+        std::vector < RealD > evec2 (row_num + 3);
+      RealD eps2;
+        ALPHA[1] = 0.;
+        BETHA[1] = 0.;
+      for (i = 0; i < row_num - 1; i++)
+	{
+	  ALPHA[i + 1] = A[i * (row_num + 1)].real ();
+	  BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
+	}
+      ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
+        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
+        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
+
+      // Do we really need to sort here?
+      int begin = 1;
+      int end = row_num;
+      int swapped = 1;
+      while (swapped)
+	{
+	  swapped = 0;
+	  for (i = begin; i < end; i++)
+	    {
+	      if (mag (evec2[i]) > mag (evec2[i + 1]))
+		{
+		  swap (evec2 + i, evec2 + i + 1);
+		  swapped = 1;
+		}
+	    }
+	  end--;
+	  for (i = end - 1; i >= begin; i--)
+	    {
+	      if (mag (evec2[i]) > mag (evec2[i + 1]))
+		{
+		  swap (evec2 + i, evec2 + i + 1);
+		  swapped = 1;
+		}
+	    }
+	  begin++;
+	}
+
+      for (i = 0; i < row_num; i++)
+	{
+	  for (j = 0; j < row_num; j++)
+	    {
+	      if (i == j)
+		H[i * row_num + j] = evec2[i + 1];
+	      else
+		H[i * row_num + j] = 0.;
+	    }
+	}
+    }
+#endif
+
+    static void bisec (std::vector < RealD > &c,
+		       std::vector < RealD > &b,
+		       int n,
+		       int m1,
+		       int m2,
+		       RealD eps1,
+		       RealD relfeh, std::vector < RealD > &x, RealD & eps2)
+    {
+      std::vector < RealD > wu (n + 2);
+
+      RealD h, q, x1, xu, x0, xmin, xmax;
+      int i, a, k;
+
+      b[1] = 0.0;
+      xmin = c[n] - fabs (b[n]);
+      xmax = c[n] + fabs (b[n]);
+      for (i = 1; i < n; i++)
+	{
+	  h = fabs (b[i]) + fabs (b[i + 1]);
+	  if (c[i] + h > xmax)
+	    xmax = c[i] + h;
+	  if (c[i] - h < xmin)
+	    xmin = c[i] - h;
+	}
+      xmax *= 2.;
+
+      eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
+      if (eps1 <= 0.0)
+	eps1 = eps2;
+      eps2 = 0.5 * eps1 + 7.0 * (eps2);
+      x0 = xmax;
+      for (i = m1; i <= m2; i++)
+	{
+	  x[i] = xmax;
+	  wu[i] = xmin;
+	}
+
+      for (k = m2; k >= m1; k--)
+	{
+	  xu = xmin;
+	  i = k;
+	  do
+	    {
+	      if (xu < wu[i])
+		{
+		  xu = wu[i];
+		  i = m1 - 1;
+		}
+	      i--;
+	    }
+	  while (i >= m1);
+	  if (x0 > x[k])
+	    x0 = x[k];
+	  while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
+	    {
+	      x1 = (xu + x0) / 2;
+
+	      a = 0;
+	      q = 1.0;
+	      for (i = 1; i <= n; i++)
+		{
+		  q =
+		    c[i] - x1 -
+		    ((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
+		  if (q < 0)
+		    a++;
+		}
+//      printf("x1=%0.14e a=%d\n",x1,a);
+	      if (a < k)
+		{
+		  if (a < m1)
+		    {
+		      xu = x1;
+		      wu[m1] = x1;
+		    }
+		  else
+		    {
+		      xu = x1;
+		      wu[a + 1] = x1;
+		      if (x[a] > x1)
+			x[a] = x1;
+		    }
+		}
+	      else
+		x0 = x1;
+	    }
+	  printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
+	  x[k] = (x0 + xu) / 2;
+	}
+    }
+  };
+
+/////////////////////////////////////////////////////////////
+// Implicitly restarted lanczos
+/////////////////////////////////////////////////////////////
+
+
+  template < class Field > class SimpleLanczos
+  {
+
+    const RealD small = 1.0e-16;
+  public:
+    int lock;
+    int get;
+    int Niter;
+    int converged;
+
+    int Nstop;			// Number of evecs checked for convergence
+    int Nk;			// Number of converged sought
+    int Np;			// Np -- Number of spare vecs in kryloc space
+    int Nm;			// Nm -- total number of vectors
+
+
+    RealD OrthoTime;
+
+    RealD eresid;
+
+//    SortEigen < Field > _sort;
+
+    LinearFunction < Field > &_Linop;
+
+//    OperatorFunction < Field > &_poly;
+
+    /////////////////////////
+    // Constructor
+    /////////////////////////
+    void init (void)
+    {
+    };
+//    void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector  < RealD > >&evecs);
+
+    SimpleLanczos (LinearFunction < Field > &Linop,	// op
+//		   OperatorFunction < Field > &poly,	// polynmial
+		   int _Nstop,	// sought vecs
+		   int _Nk,	// sought vecs
+		   int _Nm,	// spare vecs
+		   RealD _eresid,	// resid in lmdue deficit 
+		   int _Niter):	// Max iterations
+     
+      _Linop (Linop),
+ //     _poly (poly),
+      Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
+    {
+      Np = Nm - Nk;
+      assert (Np > 0);
+    };
+
+    /////////////////////////
+    // Sanity checked this routine (step) against Saad.
+    /////////////////////////
+    void RitzMatrix (std::vector < Field > &evec, int k)
+    {
+
+      if (1)
+	return;
+
+      GridBase *grid = evec[0].Grid();
+      Field w (grid);
+      std::cout << GridLogMessage << "RitzMatrix " << std::endl;
+      for (int i = 0; i < k; i++)
+	{
+	  _Linop(evec[i], w);
+//      _poly(_Linop,evec[i],w);
+	  std::cout << GridLogMessage << "[" << i << "] ";
+	  for (int j = 0; j < k; j++)
+	    {
+	      ComplexD in = innerProduct (evec[j], w);
+	      if (fabs ((double) i - j) > 1)
+		{
+		  if (abs (in) > 1.0e-9)
+		    {
+		      std::cout << GridLogMessage << "oops" << std::endl;
+		      abort ();
+		    }
+		  else
+		    std::cout << GridLogMessage << " 0 ";
+		}
+	      else
+		{
+		  std::cout << GridLogMessage << " " << in << " ";
+		}
+	    }
+	  std::cout << GridLogMessage << std::endl;
+	}
+    }
+
+    void step (std::vector < RealD > &lmd,
+	       std::vector < RealD > &lme,
+	       Field & last, Field & current, Field & next, uint64_t k)
+    {
+      if (lmd.size () <= k)
+	lmd.resize (k + Nm);
+      if (lme.size () <= k)
+	lme.resize (k + Nm);
+
+
+//      _poly(_Linop,current,next );   // 3. wk:=Avk−βkv_{k−1}
+      _Linop(current, next);	// 3. wk:=Avk−βkv_{k−1}
+      if (k > 0)
+	{
+	  next -= lme[k - 1] * last;
+	}
+//      std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
+
+      ComplexD zalph = innerProduct (current, next);	// 4. αk:=(wk,vk)
+      RealD alph = real (zalph);
+
+      next = next - alph * current;	// 5. wk:=wk−αkvk
+//      std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
+
+      RealD beta = normalise (next);	// 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+      // 7. vk+1 := wk/βk+1
+//       norm=beta;
+
+      int interval = Nm / 100 + 1;
+      if ((k % interval) == 0)
+	std::
+	  cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
+	  beta << std::endl;
+      const RealD tiny = 1.0e-20;
+      if (beta < tiny)
+	{
+	  std::cout << GridLogMessage << " beta is tiny " << beta << std::
+	    endl;
+	}
+      lmd[k] = alph;
+      lme[k] = beta;
+
+    }
+
+    void qr_decomp (std::vector < RealD > &lmd,
+		    std::vector  < RealD > &lme,
+		    int Nk,
+		    int Nm,
+		    std::vector  < RealD > &Qt, RealD Dsh, int kmin, int kmax)
+    {
+      int k = kmin - 1;
+      RealD x;
+
+      RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
+      RealD c = (lmd[k] - Dsh) * Fden;
+      RealD s = -lme[k] * Fden;
+
+      RealD tmpa1 = lmd[k];
+      RealD tmpa2 = lmd[k + 1];
+      RealD tmpb = lme[k];
+
+      lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
+      lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
+      lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
+      x = -s * lme[k + 1];
+      lme[k + 1] = c * lme[k + 1];
+
+      for (int i = 0; i < Nk; ++i)
+	{
+	  RealD Qtmp1 = Qt[i + Nm * k];
+	  RealD Qtmp2 = Qt[i + Nm * (k + 1)];
+	  Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
+	  Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
+	}
+
+      // Givens transformations
+      for (int k = kmin; k < kmax - 1; ++k)
+	{
+
+	  RealD Fden = 1.0 / hypot (x, lme[k - 1]);
+	  RealD c = lme[k - 1] * Fden;
+	  RealD s = -x * Fden;
+
+	  RealD tmpa1 = lmd[k];
+	  RealD tmpa2 = lmd[k + 1];
+	  RealD tmpb = lme[k];
+
+	  lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
+	  lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
+	  lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
+	  lme[k - 1] = c * lme[k - 1] - s * x;
+
+	  if (k != kmax - 2)
+	    {
+	      x = -s * lme[k + 1];
+	      lme[k + 1] = c * lme[k + 1];
+	    }
+
+	  for (int i = 0; i < Nk; ++i)
+	    {
+	      RealD Qtmp1 = Qt[i + Nm * k];
+	      RealD Qtmp2 = Qt[i + Nm * (k + 1)];
+	      Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
+	      Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
+	    }
+	}
+    }
+
+#if 0
+#ifdef USE_LAPACK
+#ifdef USE_MKL
+#define LAPACK_INT MKL_INT
+#else
+#define LAPACK_INT long long
+#endif
+    void diagonalize_lapack (std::vector  < RealD > &lmd, std::vector  < RealD > &lme, int N1,	// all
+			     int N2,	// get
+			     GridBase * grid)
+    {
+      const int size = Nm;
+      LAPACK_INT NN = N1;
+      double evals_tmp[NN];
+      double DD[NN];
+      double EE[NN];
+      for (int i = 0; i < NN; i++)
+	for (int j = i - 1; j <= i + 1; j++)
+	  if (j < NN && j >= 0)
+	    {
+	      if (i == j)
+		DD[i] = lmd[i];
+	      if (i == j)
+		evals_tmp[i] = lmd[i];
+	      if (j == (i - 1))
+		EE[j] = lme[j];
+	    }
+      LAPACK_INT evals_found;
+      LAPACK_INT lwork =
+	((18 * NN) >
+	 (1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
+      LAPACK_INT liwork = 3 + NN * 10;
+      LAPACK_INT iwork[liwork];
+      double work[lwork];
+      LAPACK_INT isuppz[2 * NN];
+      char jobz = 'N';		// calculate evals only
+      char range = 'I';		// calculate il-th to iu-th evals
+      //    char range = 'A'; // calculate all evals
+      char uplo = 'U';		// refer to upper half of original matrix
+      char compz = 'I';		// Compute eigenvectors of tridiagonal matrix
+      int ifail[NN];
+      LAPACK_INT info;
+//  int total = QMP_get_number_of_nodes();
+//  int node = QMP_get_node_number();
+//  GridBase *grid = evec[0]._grid;
+      int total = grid->_Nprocessors;
+      int node = grid->_processor;
+      int interval = (NN / total) + 1;
+      double vl = 0.0, vu = 0.0;
+      LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
+      if (iu > NN)
+	iu = NN;
+      double tol = 0.0;
+      if (1)
+	{
+	  memset (evals_tmp, 0, sizeof (double) * NN);
+	  if (il <= NN)
+	    {
+	      printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
+#ifdef USE_MKL
+	      dstegr (&jobz, &range, &NN,
+#else
+	      LAPACK_dstegr (&jobz, &range, &NN,
+#endif
+			     (double *) DD, (double *) EE, &vl, &vu, &il, &iu,	// these four are ignored if second parameteris 'A'
+			     &tol,	// tolerance
+			     &evals_found, evals_tmp, (double *) NULL, &NN,
+			     isuppz, work, &lwork, iwork, &liwork, &info);
+	      for (int i = iu - 1; i >= il - 1; i--)
+		{
+		  printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
+			  evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
+		  evals_tmp[i] = evals_tmp[i - (il - 1)];
+		  if (il > 1)
+		    evals_tmp[i - (il - 1)] = 0.;
+		}
+	    }
+	  {
+	    grid->GlobalSumVector (evals_tmp, NN);
+	  }
+	}
+// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
+    }
+#undef LAPACK_INT
+#endif
+
+
+    void diagonalize (std::vector  < RealD > &lmd,
+		      std::vector  < RealD > &lme,
+		      int N2, int N1, GridBase * grid)
+    {
+
+#ifdef USE_LAPACK
+      const int check_lapack = 0;	// just use lapack if 0, check against lapack if 1
+
+      if (!check_lapack)
+	return diagonalize_lapack (lmd, lme, N2, N1, grid);
+
+//      diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
+#endif
+    }
+#endif
+
+    static RealD normalise (Field & v)
+    {
+      RealD nn = norm2 (v);
+      nn = sqrt (nn);
+      v = v * (1.0 / nn);
+      return nn;
+    }
+
+    void orthogonalize (Field & w, std::vector < Field > &evec, int k)
+    {
+      double t0 = -usecond () / 1e6;
+      typedef typename Field::scalar_type MyComplex;
+      MyComplex ip;
+
+      if (0)
+	{
+	  for (int j = 0; j < k; ++j)
+	    {
+	      normalise (evec[j]);
+	      for (int i = 0; i < j; i++)
+		{
+		  ip = innerProduct (evec[i], evec[j]);	// are the evecs normalised? ; this assumes so.
+		  evec[j] = evec[j] - ip * evec[i];
+		}
+	    }
+	}
+
+      for (int j = 0; j < k; ++j)
+	{
+	  ip = innerProduct (evec[j], w);	// are the evecs normalised? ; this assumes so.
+	  w = w - ip * evec[j];
+	}
+      normalise (w);
+      t0 += usecond () / 1e6;
+      OrthoTime += t0;
+    }
+
+    void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
+    {
+      for (int i = 0; i < Qt.size (); ++i)
+	Qt[i] = 0.0;
+      for (int k = 0; k < Nm; ++k)
+	Qt[k + k * Nm] = 1.0;
+    }
+
+
+    void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
+    {
+
+      GridBase *grid = src.Grid();
+//      assert(grid == src._grid);
+
+      std::
+	cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
+	endl;
+      std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
+      std::cout << GridLogMessage << " -- size of eval   = " << eval.
+	size () << std::endl;
+
+//      assert(c.size() && Nm == eval.size());
+
+      std::vector < RealD > lme (Nm);
+      std::vector < RealD > lmd (Nm);
+
+
+      Field current (grid);
+      Field last (grid);
+      Field next (grid);
+
+      Nconv = 0;
+
+      RealD beta_k;
+
+      // Set initial vector
+      // (uniform vector) Why not src??
+      //      evec[0] = 1.0;
+      current = src;
+      std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
+	endl;
+      normalise (current);
+      std::
+	cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
+	std::endl;
+
+      // Initial Nk steps
+      OrthoTime = 0.;
+      double t0 = usecond () / 1e6;
+      RealD norm;		// sqrt norm of last vector
+
+      uint64_t iter = 0;
+
+      bool initted = false;
+      std::vector < RealD > low (Nstop * 10);
+      std::vector < RealD > high (Nstop * 10);
+      RealD cont = 0.;
+      while (1) {
+	  cont = 0.;
+	  std::vector < RealD > lme2 (Nm);
+	  std::vector < RealD > lmd2 (Nm);
+	  for (uint64_t k = 0; k < Nm; ++k, iter++) {
+	      step (lmd, lme, last, current, next, iter);
+	      last = current;
+	      current = next;
+	    }
+	  double t1 = usecond () / 1e6;
+	  std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
+	    t0 << "seconds" << std::endl;
+	  t0 = t1;
+	  std::
+	    cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
+	    OrthoTime << "seconds" << std::endl;
+
+	  // getting eigenvalues
+	  lmd2.resize (iter + 2);
+	  lme2.resize (iter + 2);
+	  for (uint64_t k = 0; k < iter; ++k) {
+	      lmd2[k + 1] = lmd[k];
+	      lme2[k + 2] = lme[k];
+	    }
+	  t1 = usecond () / 1e6;
+	  std::cout << GridLogMessage << "IRL:: copy: " << t1 -
+	    t0 << "seconds" << std::endl;
+	  t0 = t1;
+	  {
+	    int total = grid->_Nprocessors;
+	    int node = grid->_processor;
+	    int interval = (Nstop / total) + 1;
+	    int iu = (iter + 1) - (interval * node + 1);
+	    int il = (iter + 1) - (interval * (node + 1));
+	    std::vector < RealD > eval2 (iter + 3);
+	    RealD eps2;
+	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
+			      eps2);
+//        diagonalize(eval2,lme2,iter,Nk,grid);
+	    RealD diff = 0.;
+	    for (int i = il; i <= iu; i++) {
+		if (initted)
+		  diff =
+		    fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
+						      fabs (high[iu-i]));
+		if (initted && (diff > eresid))
+		  cont = 1.;
+		if (initted)
+		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
+			  high[iu-i], diff);
+		high[iu-i] = eval2[i];
+	      }
+	    il = (interval * node + 1);
+	    iu = (interval * (node + 1));
+	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
+			      eps2);
+	    for (int i = il; i <= iu; i++) {
+		if (initted)
+		  diff =
+		    fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
+						fabs (low[i]));
+		if (initted && (diff > eresid))
+		  cont = 1.;
+		if (initted)
+		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
+			  low[i], diff);
+		low[i] = eval2[i];
+	      }
+	    t1 = usecond () / 1e6;
+	    std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
+	      t0 << "seconds" << std::endl;
+	    t0 = t1;
+	  }
+
+	  for (uint64_t k = 0; k < Nk; ++k) {
+//          eval[k] = eval2[k];
+	    }
+	  if (initted)
+	    {
+	      grid->GlobalSumVector (&cont, 1);
+	      if (cont < 1.) return;
+	    }
+	  initted = true;
+	}
+
+    }
+
+
+
+
+
+#if 0
+
+/**
+   There is some matrix Q such that for any vector y
+   Q.e_1 = y and Q is unitary.
+**/
+    template < class T >
+      static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
+    {
+      int N = y.size ();	//Matrix Size
+      Fill (Q, 0.0);
+      T tau;
+      for (int i = 0; i < N; i++)
+	{
+	  Q[i][0] = y[i];
+	}
+      T sig = conj (y[0]) * y[0];
+      T tau0 = fabs (sqrt (sig));
+
+      for (int j = 1; j < N; j++)
+	{
+	  sig += conj (y[j]) * y[j];
+	  tau = abs (sqrt (sig));
+
+	  if (abs (tau0) > 0.0)
+	    {
+
+	      T gam = conj ((y[j] / tau) / tau0);
+	      for (int k = 0; k <= j - 1; k++)
+		{
+		  Q[k][j] = -gam * y[k];
+		}
+	      Q[j][j] = tau0 / tau;
+	    }
+	  else
+	    {
+	      Q[j - 1][j] = 1.0;
+	    }
+	  tau0 = tau;
+	}
+      return tau;
+    }
+
+/**
+	There is some matrix Q such that for any vector y
+	Q.e_k = y and Q is unitary.
+**/
+    template < class T >
+      static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
+    {
+      T tau = orthQ (Q, y);
+      SL (Q);
+      return tau;
+    }
+
+
+/**
+	Wind up with a matrix with the first con rows untouched
+
+say con = 2
+	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
+	and the matrix is upper hessenberg
+	and with f and Q appropriately modidied with Q is the arnoldi factorization
+
+**/
+
+    template < class T > static void Lock (DenseMatrix < T > &H,	///Hess mtx     
+					   DenseMatrix < T > &Q,	///Lock Transform
+					   T val,	///value to be locked
+					   int con,	///number already locked
+					   RealD small, int dfg, bool herm)
+    {
+      //ForceTridiagonal(H);
+
+      int M = H.dim;
+      DenseVector < T > vec;
+      Resize (vec, M - con);
+
+      DenseMatrix < T > AH;
+      Resize (AH, M - con, M - con);
+      AH = GetSubMtx (H, con, M, con, M);
+
+      DenseMatrix < T > QQ;
+      Resize (QQ, M - con, M - con);
+
+      Unity (Q);
+      Unity (QQ);
+
+      DenseVector < T > evals;
+      Resize (evals, M - con);
+      DenseMatrix < T > evecs;
+      Resize (evecs, M - con, M - con);
+
+      Wilkinson < T > (AH, evals, evecs, small);
+
+      int k = 0;
+      RealD cold = abs (val - evals[k]);
+      for (int i = 1; i < M - con; i++)
+	{
+	  RealD cnew = abs (val - evals[i]);
+	  if (cnew < cold)
+	    {
+	      k = i;
+	      cold = cnew;
+	    }
+	}
+      vec = evecs[k];
+
+      ComplexD tau;
+      orthQ (QQ, vec);
+      //orthQM(QQ,AH,vec);
+
+      AH = Hermitian (QQ) * AH;
+      AH = AH * QQ;
+
+      for (int i = con; i < M; i++)
+	{
+	  for (int j = con; j < M; j++)
+	    {
+	      Q[i][j] = QQ[i - con][j - con];
+	      H[i][j] = AH[i - con][j - con];
+	    }
+	}
+
+      for (int j = M - 1; j > con + 2; j--)
+	{
+
+	  DenseMatrix < T > U;
+	  Resize (U, j - 1 - con, j - 1 - con);
+	  DenseVector < T > z;
+	  Resize (z, j - 1 - con);
+	  T nm = norm (z);
+	  for (int k = con + 0; k < j - 1; k++)
+	    {
+	      z[k - con] = conj (H (j, k + 1));
+	    }
+	  normalise (z);
+
+	  RealD tmp = 0;
+	  for (int i = 0; i < z.size () - 1; i++)
+	    {
+	      tmp = tmp + abs (z[i]);
+	    }
+
+	  if (tmp < small / ((RealD) z.size () - 1.0))
+	    {
+	      continue;
+	    }
+
+	  tau = orthU (U, z);
+
+	  DenseMatrix < T > Hb;
+	  Resize (Hb, j - 1 - con, M);
+
+	  for (int a = 0; a < M; a++)
+	    {
+	      for (int b = 0; b < j - 1 - con; b++)
+		{
+		  T sum = 0;
+		  for (int c = 0; c < j - 1 - con; c++)
+		    {
+		      sum += H[a][con + 1 + c] * U[c][b];
+		    }		//sum += H(a,con+1+c)*U(c,b);}
+		  Hb[b][a] = sum;
+		}
+	    }
+
+	  for (int k = con + 1; k < j; k++)
+	    {
+	      for (int l = 0; l < M; l++)
+		{
+		  H[l][k] = Hb[k - 1 - con][l];
+		}
+	    }			//H(Hb[k-1-con][l] , l,k);}}
+
+	  DenseMatrix < T > Qb;
+	  Resize (Qb, M, M);
+
+	  for (int a = 0; a < M; a++)
+	    {
+	      for (int b = 0; b < j - 1 - con; b++)
+		{
+		  T sum = 0;
+		  for (int c = 0; c < j - 1 - con; c++)
+		    {
+		      sum += Q[a][con + 1 + c] * U[c][b];
+		    }		//sum += Q(a,con+1+c)*U(c,b);}
+		  Qb[b][a] = sum;
+		}
+	    }
+
+	  for (int k = con + 1; k < j; k++)
+	    {
+	      for (int l = 0; l < M; l++)
+		{
+		  Q[l][k] = Qb[k - 1 - con][l];
+		}
+	    }			//Q(Qb[k-1-con][l] , l,k);}}
+
+	  DenseMatrix < T > Hc;
+	  Resize (Hc, M, M);
+
+	  for (int a = 0; a < j - 1 - con; a++)
+	    {
+	      for (int b = 0; b < M; b++)
+		{
+		  T sum = 0;
+		  for (int c = 0; c < j - 1 - con; c++)
+		    {
+		      sum += conj (U[c][a]) * H[con + 1 + c][b];
+		    }		//sum += conj( U(c,a) )*H(con+1+c,b);}
+		  Hc[b][a] = sum;
+		}
+	    }
+
+	  for (int k = 0; k < M; k++)
+	    {
+	      for (int l = con + 1; l < j; l++)
+		{
+		  H[l][k] = Hc[k][l - 1 - con];
+		}
+	    }			//H(Hc[k][l-1-con] , l,k);}}
+
+	}
+    }
+#endif
+
+
+  };
+
+}
+#endif
@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once

+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+
 NAMESPACE_BEGIN(Grid);

 inline RealD AggregatePowerLaw(RealD x)
@@ -95,7 +97,7 @@ public:

    RealD scale;

-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
+    ConjugateGradient<FineField> CG(1.0e-3,400,false);
    FineField noise(FineGrid);
    FineField Mn(FineGrid);

@@ -108,7 +110,7 @@ public:
      
      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;

-      for(int i=0;i<1;i++){
+      for(int i=0;i<4;i++){

 	CG(hermop,noise,subspace[b]);

@@ -124,6 +126,53 @@ public:
    }
  }

+  virtual void CreateSubspaceGCR(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
+  {
+    RealD scale;
+
+    TrivialPrecon<FineField> simple_fine;
+    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
+    FineField noise(FineGrid);
+    FineField src(FineGrid);
+    FineField guess(FineGrid);
+    FineField Mn(FineGrid);
+
+    for(int b=0;b<nn;b++){
+      
+      subspace[b] = Zero();
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+      
+      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
+
+      for(int i=0;i<2;i++){
+	//  void operator() (const Field &src, Field &psi){
+#if 1
+	std::cout << GridLogMessage << " inverting on noise "<<std::endl;
+	src = noise;
+	guess=Zero();
+	GCR(src,guess);
+	subspace[b] = guess;
+#else
+	std::cout << GridLogMessage << " inverting on zero "<<std::endl;
+	src=Zero();
+	guess = noise;
+	GCR(src,guess);
+	subspace[b] = guess;
+#endif
+	noise = subspace[b];
+	scale = std::pow(norm2(noise),-0.5); 
+	noise=noise*scale;
+
+      }
+
+      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
+      subspace[b]   = noise;
+
+    }
+  }
+
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
@@ -160,14 +209,21 @@ public:

    int b =0;
    {
+      ComplexD ip;
      // Filter
      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+      hermop.Op(Mn,tmp);
+      ip= innerProduct(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
+      hermop.AdjOp(Mn,tmp); 
+      ip = innerProduct(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
      b++;
    }

@@ -213,8 +269,18 @@ public:
 	  Mn=*Tnp;
 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
 	  subspace[b] = Mn;
-	  hermop.Op(Mn,tmp); 
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+
+	  ComplexD ip;
+
+	  hermop.Op(Mn,tmp);
+	  ip= innerProduct(Mn,tmp); 
+	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
+	  hermop.AdjOp(Mn,tmp); 
+	  ip = innerProduct(Mn,tmp); 
+	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+	  
 	  b++;
 	}

@@ -228,6 +294,70 @@ public:
    }
    assert(b==nn);
  }
+
+
+  virtual void CreateSubspacePolyCheby(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo1,
+				       int orderfilter,
+				       double lo2,
+				       int orderstep)
+  {
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
+    // Initial matrix element
+    hermop.Op(noise,Mn);
+    std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {
+      // Filter
+      std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
+      Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
+    }
+
+    // Generate a full sequence of Chebyshevs
+    for(int n=1;n<nn;n++){
+      std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
+      Chebyshev<FineField> Cheb(lo2,hi,orderstep);
+      Cheb(hermop,subspace[n-1],Mn);
+
+      for(int m=0;m<n;m++){
+	ComplexD c = innerProduct(subspace[m],Mn);
+	Mn = Mn - c*subspace[m];
+      }
+      
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5);
+      Mn=Mn*scale;
+      
+      subspace[n]=Mn;
+      
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
+
+    }
+  }
+
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
@@ -99,7 +99,7 @@ public:
  CoarseMatrix AselfInvEven;
  CoarseMatrix AselfInvOdd;

-  Vector<RealD> dag_factor;
+  deviceVector<RealD> dag_factor;

  ///////////////////////
  // Interface
@@ -124,9 +124,13 @@ public:
    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
      
-    Vector<Aview> AcceleratorViewContainer;
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
  
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    for(int p=0;p<geom.npoint;p++) {
+      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+    }
    Aview *Aview_p = & AcceleratorViewContainer[0];

    const int Nsimd = CComplex::Nsimd();
@@ -161,7 +165,7 @@ public:
      coalescedWrite(out_v[ss](b),res);
      });

-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  };

  void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -190,9 +194,14 @@ public:
    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;

-    Vector<Aview> AcceleratorViewContainer;

-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
+  
+    for(int p=0;p<geom.npoint;p++) {
+      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+    }
    Aview *Aview_p = & AcceleratorViewContainer[0];

    const int Nsimd = CComplex::Nsimd();
@@ -201,10 +210,10 @@ public:

    int osites=Grid()->oSites();

-    Vector<int> points(geom.npoint, 0);
-    for(int p=0; p<geom.npoint; p++)
-      points[p] = geom.points_dagger[p];
-
+    deviceVector<int> points(geom.npoint);
+    for(int p=0; p<geom.npoint; p++) { 
+      acceleratorPut(points[p],geom.points_dagger[p]);
+    }
    auto points_p = &points[0];

    RealD* dag_factor_p = &dag_factor[0];
@@ -236,7 +245,7 @@ public:
      coalescedWrite(out_v[ss](b),res);
      });

-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }

  void MdirComms(const CoarseVector &in)
@@ -251,8 +260,14 @@ public:
    out.Checkerboard() = in.Checkerboard();

    typedef LatticeView<Cobj> Aview;
-    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
+  
+    for(int p=0;p<geom.npoint;p++) {
+      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+    }
    Aview *Aview_p = & AcceleratorViewContainer[0];

    autoView( out_v , out, AcceleratorWrite);
@@ -285,7 +300,7 @@ public:
      }
      coalescedWrite(out_v[ss](b),res);
    });
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
  {
@@ -469,14 +484,20 @@ public:

    // determine in what order we need the points
    int npoint = geom.npoint-1;
-    Vector<int> points(npoint, 0);
-    for(int p=0; p<npoint; p++)
-      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
-
+    deviceVector<int> points(npoint);
+    for(int p=0; p<npoint; p++) {
+      int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
+      acceleratorPut(points[p], val);
+    }
    auto points_p = &points[0];

-    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
+  
+    for(int p=0;p<geom.npoint;p++) {
+      hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
+      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+    }
    Aview *Aview_p = & AcceleratorViewContainer[0];

    const int Nsimd = CComplex::Nsimd();
@@ -539,7 +560,7 @@ public:
      });
    }

-    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }
  
  CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	:
@@ -590,11 +611,13 @@ public:
    }

    // GPU readable prefactor
+    std::vector<RealD> h_dag_factor(nbasis*nbasis);
    thread_for(i, nbasis*nbasis, {
      int j = i/nbasis;
      int k = i%nbasis;
-      dag_factor[i] = dag_factor_eigen(j, k);
+      h_dag_factor[i] = dag_factor_eigen(j, k);
    });
+    acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
  }

  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
@@ -441,8 +441,20 @@ public:
    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
  }
 #else
+  //////////////////////////////////////////////////////////////////////
+  // Galerkin projection of matrix
+  //////////////////////////////////////////////////////////////////////
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    CoarsenOperator(linop,Subspace,Subspace);
+  }
+  //////////////////////////////////////////////////////////////////////
+  // Petrov - Galerkin projection of matrix
+  //////////////////////////////////////////////////////////////////////
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & U,
+		       Aggregation<Fobj,CComplex,nbasis> & V)
  {
    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid()); 
-    blockOrthogonalise(InnerProd,Subspace.subspace);
+    blockOrthogonalise(InnerProd,V.subspace);
+    blockOrthogonalise(InnerProd,U.subspace);

-    //    for(int s=0;s<Subspace.subspace.size();s++){
-      //      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
-    //    }
    const int npoint = geom.npoint;
      
    Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 	tphaseBZ-=usecond();
-	phaV = phaF[p]*Subspace.subspace[i];
+	phaV = phaF[p]*V.subspace[i];
 	tphaseBZ+=usecond();

 	/////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
 	//	std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;

 	tproj-=usecond();
-	blockProject(coarseInner,MphaV,Subspace.subspace);
+	blockProject(coarseInner,MphaV,U.subspace);
 	coarseInner = conjugate(pha[p]) * coarseInner;

 	ComputeProj[p] = coarseInner;
@@ -54,6 +54,9 @@ public:
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
    _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
+    if ( (_Tp*)ptr == (_Tp *) NULL ) {
+      printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
+    }
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
    return ptr;
  }
@@ -66,7 +69,7 @@ public:
  }

  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { assert(0);};
+  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
@@ -100,6 +103,9 @@ public:
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
    _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
+    if ( (_Tp*)ptr == (_Tp *) NULL ) {
+      printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
+    }
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
    return ptr;
  }
@@ -145,6 +151,9 @@ public:
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
    _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
+    if ( (_Tp*)ptr == (_Tp *) NULL ) {
+      printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
+    }
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
    return ptr;
  }
@@ -165,19 +174,10 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-#ifdef ACCELERATOR_CSHIFT
-// Cshift on device
-template<class T> using cshiftAllocator = devAllocator<T>;
-#else
-// Cshift on host
-template<class T> using cshiftAllocator = std::allocator<T>;
-#endif
-
-template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
-template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
-template<class T> using commVector    = std::vector<T,devAllocator<T> >;
-template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
-template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;
+template<class T> using hostVector          = std::vector<T,alignedAllocator<T> >;           // Needs autoview
+template<class T> using Vector              = std::vector<T,uvmAllocator<T> >;               // Really want to deprecate
+template<class T> using uvmVector           = std::vector<T,uvmAllocator<T> >;               // auto migrating page
+template<class T> using deviceVector        = std::vector<T,devAllocator<T> >;               // device vector

 /*
 template<class T> class vecView
@@ -188,8 +188,9 @@ template<class T> class vecView
  ViewMode mode;
  void * cpu_ptr;
 public:
+  // Rvalue accessor
  accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
-  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
+  vecView(Vector<T> &refer_to_me,ViewMode _mode)
  {
    cpu_ptr = &refer_to_me[0];
    size = refer_to_me.size();
@@ -205,22 +206,12 @@ template<class T> class vecView
  }
 };

-template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
+template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
 {
  vecView<T> ret(vec,_mode); // does the open
  return ret;                // must be closed
 }

-// Little autoscope assister
-template<class View> 
-class VectorViewCloser
-{
-  View v;  // Take a copy of view and call view close when I go out of scope automatically
- public:
-  VectorViewCloser(View &_v) : v(_v) {};
-  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose();  MemoryManager::NotifyDeletion(ptr);}
-};
-
 #define autoVecView(v_v,v,mode)					\
  auto v_v = VectorView(v,mode);				\
  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
@@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
+
+#if defined(__has_feature)
+#if __has_feature(leak_sanitizer)
+#define ASAN_LEAK_CHECK
+#endif
+#endif
+
+#ifdef ASAN_LEAK_CHECK
+#include <sanitizer/asan_interface.h>
+#include <sanitizer/common_interface_defs.h>
+#include <sanitizer/lsan_interface.h>
+#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
+#else
+#define LEAK_CHECK(A) { }
+#endif
+
+void MemoryManager::DisplayMallinfo(void)
+{
+#ifdef __linux__
+  struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
+  
+  mi = mallinfo();
+
+  std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl;
+  std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl;
+  std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl;
+  std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl;
+  std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl;
+  std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl;
+  std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
+  std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl;
+  std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl;
+  std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl;
+#endif
+  LEAK_CHECK();
+ 
+}
+
 void MemoryManager::PrintBytes(void)
 {
  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
 #ifdef GRID_CUDA
  cuda_mem();
 #endif
-  
+  DisplayMallinfo();
 }

 uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
@@ -211,6 +211,7 @@ private:
 #endif

 public:
+  static void DisplayMallinfo(void);
  static void NotifyDeletion(void * CpuPtr);
  static void Print(void);
  static void PrintAll(void);
@@ -1,16 +1,15 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM

-#warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);

 #define MAXLINE 512
 static char print_buffer [ MAXLINE ];

-#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
+#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug  << print_buffer << std::endl;
 //#define dprintf(...) 
-
+//#define mprintf(...) 

 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
    AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
@@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  //  uint64_t CpuPtr = AccCache.CpuPtr;
  DeviceEvictions++;
@@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice   Clone size %ld AccPtr %lx <- CpuPtr %lx",
+	  (uint64_t)AccCache.bytes,
+	  (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    CpuViewClose((uint64_t)Ptr);
@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
+  if(bytes>=DeviceMaxBytes) {
+    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
+  }
  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error

  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
+    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
  } else if(AccCache.state==CpuDirty ){
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
  } else {
    assert(0);
  }
@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  // If view is opened on device must remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU \n");
+    dprintf("AccCache entry removed from LRU ");
    LRUremove(AccCache);
  }

@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
  AccCache.accLock--;
  // Move to LRU queue if not locked and close on device
  if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
    LRUinsert(AccCache);
  } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
  }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
-  uint64_t pagedata[npages];
+  std::vector<uint64_t> pagedata(npages);
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
-  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
+  ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
@@ -82,6 +82,7 @@ public:
  bool _isCheckerBoarded; 
  int        LocallyPeriodic;
  Coordinate _checker_dim_mask;
+  int              _checker_dim;

 public:

@@ -89,7 +90,7 @@ public:
  // Checkerboarding interface is virtual and overridden by 
  // GridCartesian / GridRedBlackCartesian
  ////////////////////////////////////////////////////////////////
-  virtual int CheckerBoarded(int dim)=0;
+  virtual int CheckerBoarded(int dim) =0;
  virtual int CheckerBoard(const Coordinate &site)=0;
  virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
  virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {

 public:
  int dummy;
-  Coordinate _checker_dim_mask;
+  //  Coordinate _checker_dim_mask;
  virtual int  CheckerBoardFromOindexTable (int Oindex) {
    return 0;
  }
@@ -46,7 +46,7 @@ public:
  {
    return 0;
  }
-  virtual int CheckerBoarded(int dim){
+  virtual int CheckerBoarded(int dim) {
    return 0;
  }
  virtual int CheckerBoard(const Coordinate &site){
@@ -106,6 +106,7 @@ public:
    _rdimensions.resize(_ndimension);
    _simd_layout.resize(_ndimension);
    _checker_dim_mask.resize(_ndimension);;
+    _checker_dim = -1;
    _lstart.resize(_ndimension);
    _lend.resize(_ndimension);

@@ -57,9 +57,10 @@ class GridRedBlackCartesian : public GridBase
 {
 public:
  //  Coordinate _checker_dim_mask;
-  int              _checker_dim;
+  //  int              _checker_dim;
  std::vector<int> _checker_board;

+  virtual int isCheckerBoarded(void) const { return 1; };
  virtual int CheckerBoarded(int dim){
    if( dim==_checker_dim) return 1;
    else return 0;
@@ -147,7 +148,7 @@ public:
  {
    Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim)  ;
  }
-
+  
  virtual ~GridRedBlackCartesian() = default;

  void Init(const Coordinate &dimensions,
@@ -57,18 +57,29 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////

+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(ComplexF &c)
+{
+  GlobalSumP2P(c);
+}
+void CartesianCommunicator::GlobalSum(ComplexD &c)
+{
+  GlobalSumP2P(c);
+}
+#else
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
  GlobalSumVector((float *)&c,2);
 }
-void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
-{
-  GlobalSumVector((float *)c,2*N);
-}
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
  GlobalSumVector((double *)&c,2);
 }
+#endif
+void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
+{
+  GlobalSumVector((float *)c,2*N);
+}
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  GlobalSumVector((double *)c,2*N);
@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>

+#define NVLINK_GET
+
 NAMESPACE_BEGIN(Grid);

 extern bool Stencil_force_mpi ;
@@ -127,7 +129,36 @@ public:
  void GlobalSumVector(ComplexD *c,int N);
  void GlobalXOR(uint32_t &);
  void GlobalXOR(uint64_t &);
-  
+
+  template<class obj> void GlobalSumP2P(obj &o)
+  {
+    std::vector<obj> column;
+    obj accum = o;
+    int source,dest;
+    for(int d=0;d<_ndimension;d++){
+      column.resize(_processors[d]);
+      column[0] = accum;
+      std::vector<MpiCommsRequest_t> list;
+      for(int p=1;p<_processors[d];p++){
+	ShiftedRanks(d,p,source,dest);
+	SendToRecvFromBegin(list,
+			    &column[0],
+			    dest,
+			    &column[p],
+			    source,
+			    sizeof(obj),d*100+p);
+
+      }
+      if (!list.empty()) // avoid triggering assert in comms == none
+	CommsComplete(list);
+      for(int p=1;p<_processors[d];p++){
+	accum = accum + column[p];
+      }
+    }
+    Broadcast(0,accum);
+    o=accum;
+  }
+
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
@@ -138,8 +169,8 @@ public:
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  void CommsComplete(std::vector<MpiCommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 			   void *xmit,
 			   int dest,
 			   void *recv,
@@ -158,6 +189,17 @@ public:
 			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);

+  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+				      void *xmit,
+				      int xmit_to_rank,int do_xmit,
+				      void *recv,
+				      int recv_from_rank,int do_recv,
+				      int xbytes,int rbytes,int dir);
+
+  // Could do a PollHtoD and have a CommsMerge dependence
+  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
+  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
+
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;

 ////////////////////////////////////////////
@@ -257,6 +258,25 @@ CartesianCommunicator::~CartesianCommunicator()
    }
  }
 }
+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(float &f){
+  CartesianCommunicator::GlobalSumP2P(f);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  CartesianCommunicator::GlobalSumP2P(d);
+}
+#else
+void CartesianCommunicator::GlobalSum(float &f){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+#endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
@@ -287,27 +307,18 @@ void CartesianCommunicator::GlobalMax(double &d)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }

-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
@@ -332,7 +343,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  assert(ierr==0);
  list.push_back(xrq);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
 {
  int nreq=list.size();

@@ -351,9 +362,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int from,
 					   int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
-  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+  std::vector<MpiCommsRequest_t> reqs(0);

  int myrank = _processor;
  int ierr;
@@ -369,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 		    communicator,MPI_STATUS_IGNORE);
  assert(ierr==0);

-  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
-  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@@ -381,12 +387,287 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }

-#undef NVLINK_GET // Define to use get instead of put DMA
+
+#ifdef ACCELERATOR_AWARE_MPI
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   int xbytes,int rbytes,int dir)
+{
+  return 0.0; // Do nothing -- no preparation required
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,
+							 int dest,int dox,
+							 void *recv,
+							 int from,int dor,
+							 int xbytes,int rbytes,int dir)
+{
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme  == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+  
+  if ( dor ) {
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32;
+      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      list.push_back(rrq);
+      off_node_bytes+=rbytes;
+    }
+#ifdef NVLINK_GET
+    else { 
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+    }
+#endif
+  }
+  // This is a NVLINK PUT  
+  if (dox) {
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+_processor*32;
+      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      list.push_back(xrq);
+      off_node_bytes+=xbytes;
+    } else {
+#ifndef NVLINK_GET
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
+    }
+  }
+  return off_node_bytes;
+}
+
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+{
+  int nreq=list.size();
+  /*finishes Get/Put*/
+  acceleratorCopySynchronise();
+
+  if (nreq==0) return;
+  std::vector<MPI_Status> status(nreq);
+  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+  assert(ierr==0);
+  list.resize(0);
+  this->StencilBarrier(); 
+}
+
+#else /* NOT     ... ACCELERATOR_AWARE_MPI */
+///////////////////////////////////////////
+// Pipeline mode through host memory
+///////////////////////////////////////////
+  /*
+   * In prepare (phase 1):
+   * PHASE 1: (prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   * - post device - device transfers
+   * PHASE 3: (Complete)
+   * - MPI_waitall
+   * - host-device transfers
+   *
+   *********************************
+   * NB could split this further:
+   *--------------------------------
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   * PHASE 2: (BeginInterNode)
+   * - complete all copies 
+   * - post MPI send asynch
+   * PHASE 3: (BeginIntraNode)
+   * - post device - device transfers
+   * PHASE 4: (Complete)
+   * - MPI_waitall
+   * - host-device transfers asynch
+   * - (complete all copies) 
+   */
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   int xbytes,int rbytes,int dir)
+{
+/*
+ * Bring sequence from Stencil.h down to lower level.
+ * Assume using XeLink is ok
+ */  
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme  == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+
+  void * host_recv = NULL;
+  void * host_xmit = NULL;
+
+  /*
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   */
+  
+  if ( dor ) {
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32;
+      host_recv = this->HostBufferMalloc(rbytes);
+      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeRecv;
+      srq.bytes      = rbytes;
+      srq.req        = rrq;
+      srq.host_buf   = host_recv;
+      srq.device_buf = recv;
+      list.push_back(srq);
+      off_node_bytes+=rbytes;
+    }
+  }
+  
+  if (dox) {
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+
+      tag= dir+_processor*32;
+
+      host_xmit = this->HostBufferMalloc(xbytes);
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+      
+      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      //      assert(ierr==0);
+      //      off_node_bytes+=xbytes;
+
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      srq.tag        = tag;
+      srq.dest       = dest;
+      srq.commdir    = commdir;
+      list.push_back(srq);
+    }
+  }
+
+  return off_node_bytes;
+}
+/*
+ * In the interest of better pipelining, poll for completion on each DtoH and 
+ * start MPI_ISend in the meantime
+ */
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
+{
+  int pending = 0;
+  do {
+
+    pending = 0;
+
+    for(int idx = 0; idx<list.size();idx++){
+
+      if ( list[idx].PacketType==InterNodeRecv ) {
+
+	int flag = 0;
+	MPI_Status status;
+	int ierr = MPI_Test(&list[idx].req,&flag,&status);
+	assert(ierr==0);
+
+	if ( flag ) {
+	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
+	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
+	  list[idx].PacketType=InterNodeReceiveHtoD;
+	} else {
+	  pending ++;
+	}
+      }
+    }
+    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
+  } while ( pending );
+  
+}
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
+{
+  int pending = 0;
+  do {
+
+    pending = 0;
+
+    for(int idx = 0; idx<list.size();idx++){
+
+      if ( list[idx].PacketType==InterNodeXmit ) {
+
+	if ( acceleratorEventIsComplete(list[idx].ev) ) {
+
+	  void *host_xmit = list[idx].host_buf;
+	  uint32_t xbytes = list[idx].bytes;
+	  int dest        = list[idx].dest;
+	  int tag         = list[idx].tag;
+	  int commdir     = list[idx].commdir;
+	  ///////////////////
+	  // Send packet
+	  ///////////////////
+
+	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
+	  
+	  MPI_Request xrq;
+	  int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+	  assert(ierr==0);
+
+	  list[idx].req        = xrq; // Update the MPI request in the list
+
+	  list[idx].PacketType=InterNodeXmitISend;
+
+	} else {
+	  // not done, so return to polling loop
+	  pending++;
+	}
+      }
+    }
+  } while (pending);
+}  
+
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@@ -411,54 +692,106 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;

-  if ( dor ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
-      list.push_back(rrq);
-      off_node_bytes+=rbytes;
-    }
+  void * host_xmit = NULL;
+
+  ////////////////////////////////
+  // Receives already posted
+  // Copies already started
+  ////////////////////////////////
+  /*  
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   */
 #ifdef NVLINK_GET
+  if ( dor ) {
+
+    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
-#endif
-  }
-  
+
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+
+      srq.PacketType = IntraNodeRecv;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
+    }
+  }  
+#else
  if (dox) {
-    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      assert(ierr==0);
-      list.push_back(xrq);
-      off_node_bytes+=xbytes;
-    } else {
-#ifndef NVLINK_GET
+
+    if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-#endif
+
+      CommsRequest_t srq;
+      
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+
+      srq.PacketType = IntraNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
      
    }
  }
-
+#endif
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  int nreq=list.size();
+  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D

-  acceleratorCopySynchronise();
+  std::vector<MPI_Status> status;
+  std::vector<MPI_Request> MpiRequests;
+    
+  for(int r=0;r<list.size();r++){
+    // Must check each Send buf is clear to reuse
+    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
+    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
+  }

-  if (nreq==0) return;
+  int nreq=MpiRequests.size();

-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-  list.resize(0);
+  if (nreq>0) {
+    status.resize(MpiRequests.size());
+    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+    assert(ierr==0);
+  }
+  
+  //  for(int r=0;r<nreq;r++){
+  //    if ( list[r].PacketType==InterNodeRecv ) {
+  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
+  //    }
+  //  }
+  
+  
+  list.resize(0);               // Delete the list
+  this->HostBufferFreeAll();    // Clean up the buffer allocs
+#ifndef NVLINK_GET
+  this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
+#endif   
 }
+#endif
+////////////////////////////////////////////
+// END PIPELINE MODE / NO CUDA AWARE MPI
+////////////////////////////////////////////
+
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
  return 2.0*bytes;
 }
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int xmit_to_rank,int dox,
+							   void *recv,
+							   int recv_from_rank,int dor,
+							   int xbytes,int rbytes, int dir)
+{
+  return 0.0;
+}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,int dox,
@@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);

 #if defined (GRID_COMMS_MPI3) 
 typedef MPI_Comm    Grid_MPI_Comm;
+typedef MPI_Request MpiCommsRequest_t;
+#ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
+#else
+/*
+ * Enable state transitions as each packet flows.
+ */
+enum PacketType_t {
+  FaceGather,
+  InterNodeXmit,
+  InterNodeRecv,
+  IntraNodeXmit,
+  IntraNodeRecv,
+  InterNodeXmitISend,
+  InterNodeReceiveHtoD
+};
+/*
+ *Package arguments needed for various actions along packet flow
+ */
+typedef struct {
+  PacketType_t PacketType;
+  void *host_buf;
+  void *device_buf;
+  int dest;
+  int tag;
+  int commdir;
+  unsigned long bytes;
+  acceleratorEvent_t ev;
+  MpiCommsRequest_t req;
+} CommsRequest_t;
+#endif
+
 #else 
+typedef int MpiCommsRequest_t;
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
 #endif
@@ -105,7 +137,7 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
-  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
+  //  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  static void SharedMemoryZero(void *dest,size_t bytes);

 };
@@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
+#else
+#ifdef HAVE_NUMAIF_H
+  #warning " Using NUMAIF "
+#include <numaif.h>
+#endif 
 #endif 
 #include <syscall.h>
 #endif
@@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  HostCommBuf= malloc(bytes);
+  // printf("Host buffer allocate for GPU non-aware MPI\n");
+#if 0
+  HostCommBuf= acceleratorAllocHost(bytes);
+#else 
+  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
+#if 0
+  #warning "Moving host buffers to specific NUMA domain"
+  int numa;
+  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
+  if(numa_name) {
+    unsigned long page_size = sysconf(_SC_PAGESIZE);
+    numa = atoi(numa_name);
+    unsigned long page_count = bytes/page_size;
+    std::vector<void *> pages(page_count);
+    std::vector<int>    nodes(page_count,numa);
+    std::vector<int>    status(page_count,-1);
+    for(unsigned long p=0;p<page_count;p++){
+      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
+    }
+    int ret = move_pages(0,
+			 page_count,
+			 &pages[0],
+			 &nodes[0],
+			 &status[0],
+			 MPOL_MF_MOVE);
+    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
+    if (ret) perror(" move_pages failed for reason:");
+  }
+#endif  
+  acceleratorPin(HostCommBuf,bytes);
+#endif  
+
 #endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
@@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+    auto zeDevice    = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+    auto zeContext   = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
      
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
@@ -880,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
  bzero(dest,bytes);
 #endif
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-  acceleratorCopyToDevice(src,dest,bytes);
-#else   
-  bcopy(src,dest,bytes);
-#endif
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+//  acceleratorCopyToDevice(src,dest,bytes);
+//#else   
+//  bcopy(src,dest,bytes);
+//#endif
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@@ -923,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);

    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
  }
  ShmBufferFreeAll();

@@ -953,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
 #endif

-  //SharedMemoryTest();
+  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
@@ -975,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
       check[0]=GlobalSharedMemory::WorldNode;
       check[1]=r;
       check[2]=magic;
-       GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
+       acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
    }
  }
  ShmBarrier();
  for(uint64_t r=0;r<ShmSize;r++){
-    ShmBarrier();
-    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
-    ShmBarrier();
+    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==magic);
-    ShmBarrier();
  }
+  ShmBarrier();
+  std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
 }

 void *SharedMemory::ShmBuffer(int rank)
@@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #endif 

 NAMESPACE_BEGIN(Grid);
-
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
 auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr)) 
 {
@@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);

 extern std::vector<std::pair<int,int> > Cshift_table; 
-extern commVector<std::pair<int,int> > Cshift_table_device; 
+extern deviceVector<std::pair<int,int> > Cshift_table_device; 

 inline std::pair<int,int> *MapCshiftTable(void)
 {
  // GPU version
-#ifdef ACCELERATOR_CSHIFT    
  uint64_t sz=Cshift_table.size();
  if (Cshift_table_device.size()!=sz )    {
    Cshift_table_device.resize(sz);
@@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
 			  sizeof(Cshift_table[0])*sz);

  return &Cshift_table_device[0];
-#else 
-  return &Cshift_table[0];
-#endif
  // CPU version use identify map
 }
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];

@@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
  {
    auto buffer_p = & buffer[0];
    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for(i,ent,{
-      buffer_p[table[i].first]=rhs_v[table[i].second];
-    });
-#endif
  }
 }

@@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
  int n1=rhs.Grid()->_slice_stride[dimension];

  if ( cbmask ==0x3){
-#ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for2d(n,e1,b,e2,{
-	int o      =   n*n1;
-	int offset = b+n*e2;
-	
-	vobj temp =rhs_v[so+o+b];
-	extract<vobj>(temp,pointers,offset);
-      });
-#endif
  } else { 
    Coordinate rdim=rhs.Grid()->_rdimensions;
    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
-#ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	  extract<vobj>(temp,pointers,offset);
 	}
      });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for2d(n,e1,b,e2,{
-
-	Coordinate coor;
-
-	int o=n*n1;
-	int oindex = o+b;
-
-       	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
-
-	int ocb=1<<cb;
-	int offset = b+n*e2;
-
-	if ( ocb & cbmask ) {
-	  vobj temp =rhs_v[so+o+b];
-	  extract<vobj>(temp,pointers,offset);
-	}
-      });
-#endif
  }
 }

 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];

@@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  {
    auto buffer_p = & buffer[0];
    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
    });
-#else
-    autoView( rhs_v, rhs, CpuWrite);
-    thread_for(i,ent,{
-      rhs_v[table[i].first]=buffer_p[table[i].second];
-    });
-#endif
  }
 }

@@ -278,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  if(cbmask ==0x3 ) {
    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
    int _slice_block = rhs.Grid()->_slice_block[dimension];
-#ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v , rhs, AcceleratorWrite);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@@ -287,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
      });
-#else
-    autoView( rhs_v , rhs, CpuWrite);
-    thread_for2d(n,e1,b,e2,{
-	int o      = n*_slice_stride;
-	int offset = b+n*_slice_block;
-	merge(rhs_v[so+o+b],pointers,offset);
-    });
-#endif
  } else { 

    // Case of SIMD split AND checker dim cannot currently be hit, except in 
@@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs

  {
    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
      coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    autoView(lhs_v , lhs, CpuWrite);
-    thread_for(i,ent,{
-      lhs_v[table[i].first]=rhs_v[table[i].second];
-    });
-#endif
  }
 }

@@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo

  {
    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
    accelerator_for(i,ent,1,{
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
    });
-#else
-    autoView( rhs_v, rhs, CpuRead);
-    autoView( lhs_v, lhs, CpuWrite);
-    thread_for(i,ent,{
-      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
-    });
-#endif
  }
 }

@@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>


 NAMESPACE_BEGIN(Grid); 
-
+const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
  typedef typename vobj::vector_type vector_type;
@@ -55,17 +55,17 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  RealD t1,t0;
  t0=usecond();
  if ( !comm_dim ) {
-    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
+    //    std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
-    //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
+    //    std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
-    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
+    //    std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  t1=usecond();
-  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
+  if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }

@@ -94,18 +94,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
  sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
  sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);

-  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+  //  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
  if ( sshift[0] == sshift[1] ) {
-    //std::cout << "Single pass Cshift_comms" <<std::endl;
+    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
  } else {
-    //std::cout << "Two pass Cshift_comms" <<std::endl;
+    //    std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
 }
-#define ACCELERATOR_CSHIFT_NO_COPY
-#ifdef ACCELERATOR_CSHIFT_NO_COPY
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  typedef typename vobj::vector_type vector_type;
@@ -125,9 +123,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
-  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
-    
+  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
+#ifndef ACCELERATOR_AWARE_MPI
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+#endif
+  
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  RealD tcopy=0.0;
@@ -158,18 +160,31 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
+
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      
      tcomms-=usecond();
-      //      grid->Barrier();
+      grid->Barrier();

+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
+#else
+      // bouncy bouncy
+      acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
+      grid->SendToRecvFrom((void *)&hsend_buf[0],
+			   xmit_to_rank,
+			   (void *)&hrecv_buf[0],
+			   recv_from_rank,
+			   bytes);
+      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+
      xbytes+=bytes;
-      //      grid->Barrier();
+      grid->Barrier();
      tcomms+=usecond();

      tscatter-=usecond();
@@ -177,13 +192,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      tscatter+=usecond();
    }
  }
-  /*
-  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
+  if (Cshift_verbose){
+    std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  }
 }

 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -201,9 +216,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;

-  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
-  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
-  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
+  //  std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //	    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
+  //	    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;

  assert(comm_dim==1);
  assert(simd_layout==2);
@@ -224,16 +239,20 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
-  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
+  static std::vector<deviceVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<deviceVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
- 
+
  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
-
+#ifndef ACCELERATOR_AWARE_MPI
+  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
+  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+#endif
+  
  int bytes = buffer_size*sizeof(scalar_object);

  ExtractPointerArray<scalar_object>  pointers(Nsimd); // 
@@ -281,266 +300,50 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 

 	tcomms-=usecond();
-	//	grid->Barrier();
+	grid->Barrier();

 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
+#ifdef ACCELERATOR_AWARE_MPI
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
 			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
-
-	xbytes+=bytes;
-	//	grid->Barrier();
-	tcomms+=usecond();
-
-	rpointers[i] = &recv_buf_extract[i][0];
-      } else { 
-	rpointers[i] = &send_buf_extract[nbr_lane][0];
-      }
-
-    }
-    tscatter-=usecond();
-    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
-    tscatter+=usecond();
-  }
-  /*
-  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
-}
-#else 
-template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
-{
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-
-  GridBase *grid=rhs.Grid();
-  Lattice<vobj> temp(rhs.Grid());
-
-  int fd              = rhs.Grid()->_fdimensions[dimension];
-  int rd              = rhs.Grid()->_rdimensions[dimension];
-  int pd              = rhs.Grid()->_processors[dimension];
-  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
-  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
-  assert(simd_layout==1);
-  assert(comm_dim==1);
-  assert(shift>=0);
-  assert(shift<fd);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
-  
-  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
-  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
-  vobj *send_buf;
-  vobj *recv_buf;
-  {
-    grid->ShmBufferFreeAll();
-    size_t bytes = buffer_size*sizeof(vobj);
-    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
-    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
-  }
-    
-  int cb= (cbmask==0x2)? Odd : Even;
-  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-
-  for(int x=0;x<rd;x++){       
-
-    int sx        =  (x+sshift)%rd;
-    int comm_proc = ((x+sshift)/rd)%pd;
-    
-    if (comm_proc==0) {
-
-      tcopy-=usecond();
-      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-      tcopy+=usecond();
-
-    } else {
-
-      int words = buffer_size;
-      if (cbmask != 0x3) words=words>>1;
-
-      int bytes = words * sizeof(vobj);
-
-      tgather-=usecond();
-      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
-      tgather+=usecond();
-
-      //      int rank           = grid->_processor;
-      int recv_from_rank;
-      int xmit_to_rank;
-      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-
-
-      tcomms-=usecond();
-      //      grid->Barrier();
-
-      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
-      grid->SendToRecvFrom((void *)&send_buf[0],
-			   xmit_to_rank,
-			   (void *)&recv_buf[0],
-			   recv_from_rank,
-			   bytes);
-      xbytes+=bytes;
-      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
-
-      //      grid->Barrier();
-      tcomms+=usecond();
-
-      tscatter-=usecond();
-      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
-      tscatter+=usecond();
-    }
-  }
-  /*
-  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
-}
-
-template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
-{
-  GridBase *grid=rhs.Grid();
-  const int Nsimd = grid->Nsimd();
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_object scalar_object;
-  typedef typename vobj::scalar_type scalar_type;
-   
-  int fd = grid->_fdimensions[dimension];
-  int rd = grid->_rdimensions[dimension];
-  int ld = grid->_ldimensions[dimension];
-  int pd = grid->_processors[dimension];
-  int simd_layout     = grid->_simd_layout[dimension];
-  int comm_dim        = grid->_processors[dimension] >1 ;
-
-  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
-  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
-  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
-
-  assert(comm_dim==1);
-  assert(simd_layout==2);
-  assert(shift>=0);
-  assert(shift<fd);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
-
-  int permute_type=grid->PermuteType(dimension);
-
-  ///////////////////////////////////////////////
-  // Simd direction uses an extract/merge pair
-  ///////////////////////////////////////////////
-  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
-  //  int words = sizeof(vobj)/sizeof(vector_type);
-
-  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
-  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
-  scalar_object *  recv_buf_extract_mpi;
-  scalar_object *  send_buf_extract_mpi;
-  {
-    size_t bytes = sizeof(scalar_object)*buffer_size;
-    grid->ShmBufferFreeAll();
-    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
-    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
-  }
-  for(int s=0;s<Nsimd;s++){
-    send_buf_extract[s].resize(buffer_size);
-    recv_buf_extract[s].resize(buffer_size);
-  }
-
-  int bytes = buffer_size*sizeof(scalar_object);
-
-  ExtractPointerArray<scalar_object>  pointers(Nsimd); // 
-  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
-
-  ///////////////////////////////////////////
-  // Work out what to send where
-  ///////////////////////////////////////////
-  int cb    = (cbmask==0x2)? Odd : Even;
-  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-
-  // loop over outer coord planes orthog to dim
-  for(int x=0;x<rd;x++){       
-
-    // FIXME call local permute copy if none are offnode.
-    for(int i=0;i<Nsimd;i++){       
-      pointers[i] = &send_buf_extract[i][0];
-    }
-    tgather-=usecond();
-    int sx   = (x+sshift)%rd;
-    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
-    tgather+=usecond();
-
-    for(int i=0;i<Nsimd;i++){
-      
-      int inner_bit = (Nsimd>>(permute_type+1));
-      int ic= (i&inner_bit)? 1:0;
-
-      int my_coor          = rd*ic + x;
-      int nbr_coor         = my_coor+sshift;
-      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
-
-      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
-      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
-      int nbr_lane = (i&(~inner_bit));
-
-      int recv_from_rank;
-      int xmit_to_rank;
-
-      if (nbr_ic) nbr_lane|=inner_bit;
-
-      assert (sx == nbr_ox);
-
-      if(nbr_proc){
-	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
-
-	tcomms-=usecond();
-	//	grid->Barrier();
-
-	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
-	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
+#else
+      // bouncy bouncy
+	acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
+	grid->SendToRecvFrom((void *)&hsend_buf[0],
 			     xmit_to_rank,
-			     (void *)recv_buf_extract_mpi,
+			     (void *)&hrecv_buf[0],
 			     recv_from_rank,
 			     bytes);
-	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
-	xbytes+=bytes;
-
-	//	grid->Barrier();
-	tcomms+=usecond();
-	rpointers[i] = &recv_buf_extract[i][0];
-      } else { 
-	rpointers[i] = &send_buf_extract[nbr_lane][0];
-      }
-
-    }
-    tscatter-=usecond();
-    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
-    tscatter+=usecond();
-
-  }
-  /*
-  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
-  */
-}
+	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
 #endif
+
+	xbytes+=bytes;
+	grid->Barrier();
+	tcomms+=usecond();
+
+	rpointers[i] = &recv_buf_extract[i][0];
+      } else { 
+	rpointers[i] = &send_buf_extract[nbr_lane][0];
+      }
+
+    }
+    tscatter-=usecond();
+    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
+    tscatter+=usecond();
+  }
+  if(Cshift_verbose){
+    std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  }
+}
+
 NAMESPACE_END(Grid); 

 #endif
@@ -1,5 +1,5 @@
 #include <Grid/GridCore.h>       
 NAMESPACE_BEGIN(Grid);
 std::vector<std::pair<int,int> > Cshift_table; 
-commVector<std::pair<int,int> > Cshift_table_device; 
+deviceVector<std::pair<int,int> > Cshift_table_device; 
 NAMESPACE_END(Grid);
@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
  });
 }

+#define FAST_AXPY_NORM
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpy_norm");
-    return axpy_norm_fast(ret,a,x,y);
+#ifdef FAST_AXPY_NORM
+  return axpy_norm_fast(ret,a,x,y);
+#else
+  ret = a*x+y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpby_norm");
-    return axpby_norm_fast(ret,a,b,x,y);
+#ifdef FAST_AXPY_NORM
+  return axpby_norm_fast(ret,a,b,x,y);
+#else
+  ret = a*x+b*y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }

 /// Trace product
@@ -236,11 +236,21 @@ public:
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
    vobj vtmp;
    vtmp = r;
+#if 0
+    deviceVector<vobj> vvtmp(1);
+    acceleratorPut(vvtmp[0],vtmp);
+    vobj *vvtmp_p = & vvtmp[0];
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
-	auto stmp=coalescedRead(vtmp);
+	auto stmp=coalescedRead(*vvtmp_p);
 	coalescedWrite(me[ss],stmp);
    });
+#else    
+    auto me  = View(CpuWrite);
+    thread_for(ss,me.size(),{
+       me[ss]= r;
+      });
+#endif    
    me.ViewClose();
    return *this;
  }
@@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  typedef decltype(basis[0]) Field;
  typedef decltype(basis[0].View(AcceleratorRead)) View;

-  Vector<View> basis_v; basis_v.reserve(basis.size());
-  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
+  hostVector<View>  h_basis_v(basis.size());
+  deviceVector<View> d_basis_v(basis.size());
+  typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
+
  GridBase* grid = basis[0].Grid();
      
  for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorWrite));
+    h_basis_v[k] = basis[k].View(AcceleratorWrite);
+    acceleratorPut(d_basis_v[k],h_basis_v[k]);
  }

-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
-  int max_threads = thread_max();
-  Vector < vobj > Bt(Nm * max_threads);
-  thread_region
-    {
-      vobj* B = &Bt[Nm * thread_num()];
-      thread_for_in_region(ss, grid->oSites(),{
-	  for(int j=j0; j<j1; ++j) B[j]=0.;
-      
-	  for(int j=j0; j<j1; ++j){
-	    for(int k=k0; k<k1; ++k){
-	      B[j] +=Qt(j,k) * basis_v[k][ss];
-	    }
-	  }
-	  for(int j=j0; j<j1; ++j){
-	    basis_v[j][ss] = B[j];
-	  }
-	});
-    }
-#else
-  View *basis_vp = &basis_v[0];
+  View *basis_vp = &d_basis_v[0];

  int nrot = j1-j0;
  if (!nrot) // edge case not handled gracefully by Cuda
@@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead

-  Vector <vobj> Bt(siteBlock * nrot); 
+  deviceVector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];

  // GPU readable copy of matrix
-  Vector<Coeff_t> Qt_jv(Nm*Nm);
+  hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
+  deviceVector<Coeff_t> Qt_jv(Nm*Nm);
  Coeff_t *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
-      Qt_p[i]=Qt(j,k);
+      h_Qt_jv[i]=Qt(j,k);
  });
+  acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));

  // Block the loop to keep storage footprint down
  for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 	coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
      });
  }
-#endif

-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
+  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
 }

 // Extract a single rotated vector
@@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in

  result.Checkerboard() = basis[0].Checkerboard();

-  Vector<View> basis_v; basis_v.reserve(basis.size());
+  hostVector<View>  h_basis_v(basis.size());
+  deviceVector<View> d_basis_v(basis.size());
  for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorRead));
+    h_basis_v[k]=basis[k].View(AcceleratorRead);
+    acceleratorPut(d_basis_v[k],h_basis_v[k]);
  }
-  vobj zz=Zero();
-  Vector<double> Qt_jv(Nm);
-  double * Qt_j = & Qt_jv[0];
-  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);

-  auto basis_vp=& basis_v[0];
+  vobj zz=Zero();
+  deviceVector<double> Qt_jv(Nm);
+  double * Qt_j = & Qt_jv[0];
+  for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
+
+  auto basis_vp=& d_basis_v[0];
  autoView(result_v,result,AcceleratorWrite);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
    vobj zzz=Zero();
@@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
    }
    coalescedWrite(result_v[ss], B);
  });
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
+  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
 }

 template<class Field>
@@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)

  int Nsimd = grid->Nsimd();

-  assert( l.Checkerboard()== grid->CheckerBoard(site));
+  //  assert( l.Checkerboard()== grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

  static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  for(int w=0;w<words;w++){
    pt[w] = getlane(vp[w],idx);
  }
-      
+  //  std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
  return;
 };
 template<class vobj,class sobj>
@@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)

  int Nsimd = grid->Nsimd();

-  assert( l.Checkerboard()== grid->CheckerBoard(site));
+  //  assert( l.Checkerboard()== grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

  static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
  //  const int Nsimd = vobj::Nsimd();
  const int nthread = GridThread::GetThreads();

-  Vector<sobj> sumarray(nthread);
+  std::vector<sobj> sumarray(nthread);
  for(int i=0;i<nthread;i++){
    sumarray[i]=Zero();
  }
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)

  const int nthread = GridThread::GetThreads();

-  Vector<sobj> sumarray(nthread);
+  std::vector<sobj> sumarray(nthread);
  for(int i=0;i<nthread;i++){
    sumarray[i]=Zero();
  }
@@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  
  // Might make all code paths go this way.
-#if 0
-  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
-  Vector<inner_t> inner_tmp(sites);
-  auto inner_tmp_v = &inner_tmp[0];
-  {
-    autoView( left_v , left, AcceleratorRead);
-    autoView( right_v,right, AcceleratorRead);
-    // This code could read coalesce
-    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, nsimd,{
-	auto x_l = left_v(ss);
-	auto y_l = right_v(ss);
-	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
-    });
-  }
-#else
  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
-  Vector<inner_t> inner_tmp(sites);
+  deviceVector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
    
  {
@@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
    });
  }
-#endif
  // This is in single precision and fails some tests
  auto anrm = sumD(inner_tmp_v,sites);  
  nrm = anrm;
@@ -307,8 +290,10 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();

+  bool ok;
 #ifdef GRID_SYCL
  uint64_t csum=0;
+  uint64_t csum2=0;
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
  {
    // Hack
@@ -317,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
    uint64_t *base= (uint64_t *)&l_v[0];
    csum=svm_xor(base,words);
+    ok = FlightRecorder::CsumLog(csum);
+    if ( !ok ) {
+      csum2=svm_xor(base,words);
+      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+    } else {
+      //      csum2=svm_xor(base,words);
+      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+    }
+    assert(ok);
  }
-  FlightRecorder::CsumLog(csum);
 #endif
+  FlightRecorder::StepLog("rank inner product");
  ComplexD nrm = rankInnerProduct(left,right);
+  //  ComplexD nrmck=nrm;
  RealD local = real(nrm);
-  FlightRecorder::NormLog(real(nrm)); 
+  ok = FlightRecorder::NormLog(real(nrm));
+  if ( !ok ) {
+    ComplexD nrm2 = rankInnerProduct(left,right);
+    RealD local2 = real(nrm2);
+    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
+    assert(ok);
+  }
+  FlightRecorder::StepLog("Start global sum");
+  //  grid->GlobalSumP2P(nrm);
  grid->GlobalSum(nrm);
+  FlightRecorder::StepLog("Finished global sum");
+  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
  FlightRecorder::ReductionLog(local,real(nrm)); 
  return nrm;
 }
@@ -360,20 +365,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
-#if 0
-  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
-  Vector<inner_t> inner_tmp(sites);
-  auto inner_tmp_v = &inner_tmp[0];
-
-  accelerator_for( ss, sites, nsimd,{
-      auto tmp = a*x_v(ss)+b*y_v(ss);
-      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
-      coalescedWrite(z_v[ss],tmp);
-  });
-  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
-#else
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
-  Vector<inner_t> inner_tmp(sites);
+  deviceVector<inner_t> inner_tmp;
+  inner_tmp.resize(sites);
  auto inner_tmp_v = &inner_tmp[0];

  accelerator_for( ss, sites, nsimd,{
@@ -381,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
-  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
+  bool ok;
+#ifdef GRID_SYCL
+  uint64_t csum=0;
+  uint64_t csum2=0;
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
+    // z_v
+    {
+      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
+      uint64_t *base= (uint64_t *)&z_v[0];
+      csum=svm_xor(base,words);
+      ok = FlightRecorder::CsumLog(csum);
+      if ( !ok ) {
+	csum2=svm_xor(base,words);
+	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+      }
+      assert(ok);
+    }
+    // inner_v
+    {
+      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
+      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
+      csum=svm_xor(base,words);
+      ok = FlightRecorder::CsumLog(csum);
+      if ( !ok ) {
+	csum2=svm_xor(base,words);
+	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+      }
+      assert(ok);
+    }
+  }
 #endif
+  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
+  ok = FlightRecorder::NormLog(real(nrm));
+  assert(ok);
+  RealD local = real(nrm);
  grid->GlobalSum(nrm);
+  FlightRecorder::ReductionLog(local,real(nrm));
  return nrm; 
 }
 
@@ -393,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
  conformable(left,right);

  typedef typename vobj::vector_typeD vector_type;
-  Vector<ComplexD> tmp(2);
+  std::vector<ComplexD> tmp(2);

  GridBase *grid = left.Grid();

@@ -403,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
  // GPU
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  typedef decltype(innerProductD(vobj(),vobj())) norm_t;
-  Vector<inner_t> inner_tmp(sites);
-  Vector<norm_t>  norm_tmp(sites);
+  deviceVector<inner_t> inner_tmp(sites);
+  deviceVector<norm_t>  norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
  {
@@ -454,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
 // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////

-template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
+					  std::vector<typename vobj::scalar_object> &result,
+					  int orthogdim)
 {
  ///////////////////////////////////////////////////////
  // FIXME precision promoted summation
@@ -476,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

-  Vector<vobj> lvSum(rd); // will locally sum vectors first
-  Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
+  std::vector<vobj> lvSum(rd); // will locally sum vectors first
+  std::vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD

  result.resize(fd); // And then global sum to return the same vector to every node 
@@ -525,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  scalar_type * ptr = (scalar_type *) &result[0];
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
+  //  std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
+  
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object> 
@@ -535,7 +568,20 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
  return result;
 }

+/*
+Reimplement

+1)
+template<class vobj>
+static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
+
+2)
+template<class vobj>
+static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
+
+3)
+-- Make Slice Mul Matrix call sliceMaddMatrix
+ */
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
@@ -555,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

-  Vector<vector_type> lvSum(rd); // will locally sum vectors first
-  Vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
+  std::vector<vector_type> lvSum(rd); // will locally sum vectors first
+  std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  

  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@@ -686,203 +732,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };

-/*
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
  int nsimd = BlockSolverGrid->Nsimd();
  
-  std::vector<int> latt_phys(0);
-  std::vector<int> simd_phys(0);
-  std::vector<int>  mpi_phys(0);
-  
+  std::vector<int> latt_phys(NN-1);
+  Coordinate simd_phys;
+  std::vector<int>  mpi_phys(NN-1);
+  Coordinate checker_dim_mask(NN-1);
+  int checker_dim=-1;
+
+  int dd;
  for(int d=0;d<NN;d++){
    if( d!=Orthog ) { 
-      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
-      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
-      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
+      latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
+      mpi_phys[dd] =BlockSolverGrid->_processors[d];
+      checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
+      if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
+      dd++;
    }
  }
-  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
+  simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
+  GridCartesian *tmp         = new GridCartesian(latt_phys,simd_phys,mpi_phys);
+  if(BlockSolverGrid->_isCheckerBoarded) {
+    GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
+    delete tmp;
+    return (GridBase *) ret;
+  } else { 
+    return (GridBase *) tmp;
+  }
 }
-*/

 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
+  GridBase *FullGrid = X.Grid();
+  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+
+  Lattice<vobj> Ys(SliceGrid);
+  Lattice<vobj> Rs(SliceGrid);
+  Lattice<vobj> Xs(SliceGrid);
+  Lattice<vobj> RR(FullGrid);
+
+  RR = R; // Copies checkerboard for insert
+  
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::vector_type vector_type;
-
-  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
-
-  GridBase *FullGrid  = X.Grid();
-  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-
-  //  Lattice<vobj> Xslice(SliceGrid);
-  //  Lattice<vobj> Rslice(SliceGrid);
-
-  assert( FullGrid->_simd_layout[Orthog]==1);
-  //  int nh =  FullGrid->_ndimension;
-  //  int nl = SliceGrid->_ndimension;
-  //  int nl = nh-1;
-
-  //FIXME package in a convenient iterator
-  //Should loop over a plane orthogonal to direction "Orthog"
-  int stride=FullGrid->_slice_stride[Orthog];
-  int block =FullGrid->_slice_block [Orthog];
-  int nblock=FullGrid->_slice_nblock[Orthog];
-  int ostride=FullGrid->_ostride[Orthog];
-
-  autoView( X_v, X, CpuRead);
-  autoView( Y_v, Y, CpuRead);
-  autoView( R_v, R, CpuWrite);
-  thread_region
-  {
-    Vector<vobj> s_x(Nblock);
-
-    thread_for_collapse_in_region(2, n,nblock, {
-     for(int b=0;b<block;b++){
-      int o  = n*stride + b;
-
-      for(int i=0;i<Nblock;i++){
-	s_x[i] = X_v[o+i*ostride];
-      }
-
-      vobj dot;
-      for(int i=0;i<Nblock;i++){
-	dot = Y_v[o+i*ostride];
-	for(int j=0;j<Nblock;j++){
-	  dot = dot + s_x[j]*(scale*aa(j,i));
-	}
-	R_v[o+i*ostride]=dot;
-      }
-    }});
+  int Nslice = X.Grid()->GlobalDimensions()[Orthog];
+  for(int i=0;i<Nslice;i++){
+    ExtractSlice(Ys,Y,i,Orthog);
+    ExtractSlice(Rs,R,i,Orthog);
+    Rs=Ys;
+    for(int j=0;j<Nslice;j++){
+      ExtractSlice(Xs,X,j,Orthog);
+      Rs = Rs + Xs*(scale*aa(j,i));
+    }
+    InsertSlice(Rs,RR,i,Orthog);
  }
+  R=RR; // Copy back handles arguments aliasing case
+  delete SliceGrid;
 };

 template<class vobj>
-static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
-{    
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::vector_type vector_type;
-
-  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
-
-  GridBase *FullGrid  = X.Grid();
-  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  //  Lattice<vobj> Xslice(SliceGrid);
-  //  Lattice<vobj> Rslice(SliceGrid);
-
-  assert( FullGrid->_simd_layout[Orthog]==1);
-  //  int nh =  FullGrid->_ndimension;
-  //  int nl = SliceGrid->_ndimension;
-  //  int nl=1;
-
-  //FIXME package in a convenient iterator
-  // thread_for2d_in_region
-  //Should loop over a plane orthogonal to direction "Orthog"
-  int stride=FullGrid->_slice_stride[Orthog];
-  int block =FullGrid->_slice_block [Orthog];
-  int nblock=FullGrid->_slice_nblock[Orthog];
-  int ostride=FullGrid->_ostride[Orthog];
-  autoView( R_v, R, CpuWrite);
-  autoView( X_v, X, CpuRead);
-  thread_region
-  {
-    std::vector<vobj> s_x(Nblock);
-
-
-    thread_for_collapse_in_region( 2 ,n,nblock,{
-    for(int b=0;b<block;b++){
-      int o  = n*stride + b;
-
-      for(int i=0;i<Nblock;i++){
-	s_x[i] = X_v[o+i*ostride];
-      }
-
-      vobj dot;
-      for(int i=0;i<Nblock;i++){
-	dot = s_x[0]*(scale*aa(0,i));
-	for(int j=1;j<Nblock;j++){
-	  dot = dot + s_x[j]*(scale*aa(j,i));
-	}
-	R_v[o+i*ostride]=dot;
-      }
-    }});
-  }
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
+{
+  R=Zero();
+  sliceMaddMatrix(R,aa,X,R,Orthog,scale);
 };


 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
+  GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
+
+  Lattice<vobj> ls(SliceGrid);
+  Lattice<vobj> rs(SliceGrid);
+  
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::vector_type vector_type;
-  
-  GridBase *FullGrid  = lhs.Grid();
-  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
-  int Nblock = FullGrid->GlobalDimensions()[Orthog];
-  
-  //  Lattice<vobj> Lslice(SliceGrid);
-  //  Lattice<vobj> Rslice(SliceGrid);
-  
-  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  assert( FullGrid->_simd_layout[Orthog]==1);
-  //  int nh =  FullGrid->_ndimension;
-  //  int nl = SliceGrid->_ndimension;
-  //  int nl = nh-1;
-
-  //FIXME package in a convenient iterator
-  //Should loop over a plane orthogonal to direction "Orthog"
-  int stride=FullGrid->_slice_stride[Orthog];
-  int block =FullGrid->_slice_block [Orthog];
-  int nblock=FullGrid->_slice_nblock[Orthog];
-  int ostride=FullGrid->_ostride[Orthog];
-
-  typedef typename vobj::vector_typeD vector_typeD;
-
-  autoView( lhs_v, lhs, CpuRead);
-  autoView( rhs_v, rhs, CpuRead);
-  thread_region
-  {
-    std::vector<vobj> Left(Nblock);
-    std::vector<vobj> Right(Nblock);
-    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-    thread_for_collapse_in_region( 2, n,nblock,{
-    for(int b=0;b<block;b++){
-
-      int o  = n*stride + b;
-
-      for(int i=0;i<Nblock;i++){
-	Left [i] = lhs_v[o+i*ostride];
-	Right[i] = rhs_v[o+i*ostride];
-      }
-
-      for(int i=0;i<Nblock;i++){
-      for(int j=0;j<Nblock;j++){
-	auto tmp = innerProduct(Left[i],Right[j]);
-	auto rtmp = TensorRemove(tmp);
-	auto red  =  Reduce(rtmp);
-	mat_thread(i,j) += std::complex<double>(real(red),imag(red));
-      }}
-    }});
-    thread_critical
-    {
-      mat += mat_thread;
-    }  
+  int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
+  mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
+  for(int s=0;s<Nslice;s++){
+    ExtractSlice(ls,lhs,s,Orthog);
+    for(int ss=0;ss<Nslice;ss++){
+      ExtractSlice(rs,rhs,ss,Orthog);
+      mat(s,ss) = innerProduct(ls,rs);
+    }
  }
-
-  for(int i=0;i<Nblock;i++){
-  for(int j=0;j<Nblock;j++){
-    ComplexD sum = mat(i,j);
-    FullGrid->GlobalSum(sum);
-    mat(i,j)=sum;
-  }}
-
-  return;
+  delete SliceGrid;
 }

 NAMESPACE_END(Grid);
@@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  // Move out of UVM
  // Turns out I had messed up the synchronise after move to compute stream
  // as running this on the default stream fools the synchronise
-#undef UVM_BLOCK_BUFFER  
-#ifndef UVM_BLOCK_BUFFER  
-  commVector<sobj> buffer(numBlocks);
+  deviceVector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
-#else
-  Vector<sobj> buffer(numBlocks);
-  sobj *buffer_v = &buffer[0];
-  sobj result;
-  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
-  accelerator_barrier();
-  result = *buffer_v;
-#endif
  return result;
 }

@@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
  
  const int words = sizeof(vobj)/sizeof(vector);

-  Vector<vector> buffer(osites);
+  deviceVector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  vector *buf = &buffer[0];
  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
@@ -4,29 +4,28 @@ NAMESPACE_BEGIN(Grid);
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////

+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_objectD sobjD;
-  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
-  sobj identity; zeroit(identity);
-  sobj ret ; 

+  sobj identity; zeroit(identity);
+  sobj ret; zeroit(ret);
  Integer nsimd= vobj::Nsimd();
-  
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
-     cgh.parallel_for(cl::sycl::range<1>{osites},
-		      Reduction,
-		      [=] (cl::sycl::id<1> item, auto &sum) {
-      auto osite   = item[0];
-      sum +=Reduce(lat[osite]);
-     });
-   });
-  theGridAccelerator->wait();
-  ret = mysum[0];
-  free(mysum,*theGridAccelerator);
+  { 
+    sycl::buffer<sobj, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
+      cgh.parallel_for(sycl::range<1>{osites},
+                      Reduction,
+                      [=] (sycl::id<1> item, auto &sum) {
+                        auto osite   = item[0];
+                        sum +=Reduce(lat[osite]);
+                      });
+    });
+  }
  sobjD dret; convertType(dret,ret);
  return dret;
 }
@@ -72,55 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite

 template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
-  Word xorResult; xorResult = 0;
-  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
  Word identity;  identity=0;
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
-     cgh.parallel_for(cl::sycl::range<1>{L},
-		      Reduction,
-		      [=] (cl::sycl::id<1> index, auto &sum) {
-	 sum ^=vec[index];
-     });
-   });
+  Word ret = 0;
+  { 
+    sycl::buffer<Word, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(sycl::range<1>{L},
+                      Reduction,
+                      [=] (sycl::id<1> index, auto &sum) {
+                        sum ^=vec[index];
+                      });
+    });
+  }
  theGridAccelerator->wait();
-  Word ret = d_sum[0];
-  free(d_sum,*theGridAccelerator);
  return ret;
 }

 NAMESPACE_END(Grid);

-/*
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::vector_type  vector;
-  typedef typename vobj::scalar_type  scalar;
-
-  typedef typename vobj::scalar_typeD scalarD;
-  typedef typename vobj::scalar_objectD sobjD;
-
-  sobjD ret;
-  scalarD *ret_p = (scalarD *)&ret;
-  
-  const int nsimd = vobj::Nsimd();
-  const int words = sizeof(vobj)/sizeof(vector);
-
-  Vector<scalar> buffer(osites*nsimd);
-  scalar *buf = &buffer[0];
-  vector *dat = (vector *)lat;
-
-  for(int w=0;w<words;w++) {
-
-    accelerator_for(ss,osites,nsimd,{
-	int lane = acceleratorSIMTlane(nsimd);
-	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
-    });
-    //Precision change at this point is to late to gain precision
-    ret_p[w] = svm_reduce(buf,nsimd*osites);
-  }
-  return ret;
-}
-*/
@@ -1,5 +1,5 @@
 #pragma once
-#include <type_traits>
+
 #if defined(GRID_CUDA)

 #include <cub/cub.cuh>
@@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);


 #if defined(GRID_CUDA) || defined(GRID_HIP)
-template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+template<class vobj>
+inline void sliceSumReduction_cub_small(const vobj *Data,
+					std::vector<vobj> &lvSum,
+					const int rd,
+					const int e1,
+					const int e2,
+					const int stride,
+					const int ostride,
+					const int Nsimd)
+{
  size_t subvol_size = e1*e2;
-  commVector<vobj> reduction_buffer(rd*subvol_size);
+  deviceVector<vobj> reduction_buffer(rd*subvol_size);
  auto rb_p = &reduction_buffer[0];
  vobj zero_init;
  zeroit(zero_init);
@@ -46,7 +55,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
  
  //copy offsets to device
-  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  
  
  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -79,7 +88,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
    exit(EXIT_FAILURE);
  }
  
-  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  
  //sync after copy
  accelerator_barrier();
@@ -90,61 +99,34 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
  

 }
-
-template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
-  typedef typename vobj::vector_type vector;
-  const int words = sizeof(vobj)/sizeof(vector);
-  const int osites = rd*e1*e2;
-  commVector<vector>buffer(osites);
-  vector *dat = (vector *)Data;
-  vector *buf = &buffer[0];
-  Vector<vector> lvSum_small(rd);
-  vector *lvSum_ptr = (vector *)&lvSum[0];
-
-  for (int w = 0; w < words; w++) {
-    accelerator_for(ss,osites,1,{
-	    buf[ss] = dat[ss*words+w];
-    });
-
-    sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
-      
-    for (int r = 0; r < rd; r++) {
-      lvSum_ptr[w+words*r]=lvSum_small[r];
-    }
-
-  }
-
-  
-}
-
-template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
-{
-  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
-    if constexpr (sizeof(vobj) <= 256) { 
-      sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
-    }
-    else {
-      sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
-    }
-}
-#endif
+#endif 


 #if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+template<class vobj>
+inline void sliceSumReduction_sycl_small(const vobj *Data,
+					 std::vector <vobj> &lvSum,
+					 const int  &rd,
+					 const int &e1,
+					 const int &e2,
+					 const int &stride,
+					 const int &ostride,
+					 const int &Nsimd)
 {
-  typedef typename vobj::scalar_object sobj;
  size_t subvol_size = e1*e2;

-  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
+  vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
  vobj vobj_zero;
  zeroit(vobj_zero);
-    
-  commVector<vobj> reduction_buffer(rd*subvol_size);    
+  for (int r = 0; r<rd; r++) { 
+    mysum[r] = vobj_zero; 
+  }
+
+  deviceVector<vobj> reduction_buffer(rd*subvol_size);    

  auto rb_p = &reduction_buffer[0];

-  autoView(Data_v, Data, AcceleratorRead);
+  // autoView(Data_v, Data, AcceleratorRead);

  //prepare reduction buffer 
  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
@@ -154,30 +136,102 @@ template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Dat
      int so=r*ostride; // base offset for start of plane 
      int ss= so+n*stride+b;

-      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
+      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));

  });

  for (int r = 0; r < rd; r++) {
-      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+      theGridAccelerator->submit([&](sycl::handler &cgh) {
+          auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
+          cgh.parallel_for(sycl::range<1>{subvol_size},
          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
+          [=](sycl::id<1> item, auto &sum) {
              auto s = item[0];
              sum += rb_p[r*subvol_size+s];
          });
      });
-      theGridAccelerator->wait();
-      lvSum[r] = mysum[0];
+      
+     
+  }
+  theGridAccelerator->wait();
+  for (int r = 0; r < rd; r++) {
+    lvSum[r] = mysum[r];
  }
-  
  free(mysum,*theGridAccelerator);
 }
 #endif

-template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+template<class vobj>
+inline void sliceSumReduction_large(const vobj *Data,
+				    std::vector<vobj> &lvSum,
+				    const int rd,
+				    const int e1,
+				    const int e2,
+				    const int stride,
+				    const int ostride,
+				    const int Nsimd)
+{
+  typedef typename vobj::vector_type vector;
+  const int words = sizeof(vobj)/sizeof(vector);
+  const int osites = rd*e1*e2;
+  deviceVector<vector>buffer(osites);
+  vector *dat = (vector *)Data;
+  vector *buf = &buffer[0];
+  std::vector<vector> lvSum_small(rd);
+  vector *lvSum_ptr = (vector *)&lvSum[0];
+
+  for (int w = 0; w < words; w++) {
+    accelerator_for(ss,osites,1,{
+	    buf[ss] = dat[ss*words+w];
+    });
+
+    #if defined(GRID_CUDA) || defined(GRID_HIP)
+      sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    #elif defined(GRID_SYCL)
+      sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    #endif
+
+    for (int r = 0; r < rd; r++) {
+      lvSum_ptr[w+words*r]=lvSum_small[r];
+    }
+  }
+}
+
+template<class vobj>
+inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
+				  std::vector<vobj> &lvSum,
+				  const int rd,
+				  const int e1,
+				  const int e2,
+				  const int stride,
+				  const int ostride,
+				  const int Nsimd)
+{
+  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
+    if constexpr (sizeof(vobj) <= 256) { 
+
+      #if defined(GRID_CUDA) || defined(GRID_HIP)
+        sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #elif defined (GRID_SYCL)
+        sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #endif
+
+    }
+    else {
+      sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+    }
+}
+
+
+template<class vobj>
+inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
+				  std::vector<vobj> &lvSum,
+				  const int &rd,
+				  const int &e1,
+				  const int &e2,
+				  const int &stride,
+				  const int &ostride,
+				  const int &Nsimd)
 {
  // sum over reduced dimension planes, breaking out orthog dir
  // Parallel over orthog direction
@@ -193,20 +247,20 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
  });
 }

-template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
+template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
+						   std::vector<vobj> &lvSum,
+						   const int &rd,
+						   const int &e1,
+						   const int &e2,
+						   const int &stride,
+						   const int &ostride,
+						   const int &Nsimd) 
 {
-  #if defined(GRID_CUDA) || defined(GRID_HIP)
-  
-  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-  
-  #elif defined(GRID_SYCL)
-  
-  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-  
-  #else
+#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#else
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-
-  #endif
+#endif
 }


@@ -981,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
    hcoor[orthog] = slice;
    for(int d=0;d<nh;d++){
      if ( d!=orthog ) { 
-	hcoor[d]=lcoor[ddl++];
+	hcoor[d]=lcoor[ddl];
+	if ( hg->_checker_dim == d ) {
+	  hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
+	  lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
+	}
+	ddl++;
      }
+      
    }
    peekLocalSite(s,lowDimv,lcoor);
    pokeLocalSite(s,higherDimv,hcoor);
@@ -1003,6 +1009,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
  assert(orthog<nh);
  assert(orthog>=0);
  assert(hg->_processors[orthog]==1);
+  lowDim.Checkerboard() = higherDim.Checkerboard();

  int dl; dl = 0;
  for(int d=0;d<nh;d++){
@@ -1020,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
    Coordinate lcoor(nl);
    Coordinate hcoor(nh);
    lg->LocalIndexToLocalCoor(idx,lcoor);
-    int ddl=0;
    hcoor[orthog] = slice;
+    int ddl=0;
    for(int d=0;d<nh;d++){
      if ( d!=orthog ) { 
-	hcoor[d]=lcoor[ddl++];
+	hcoor[d]=lcoor[ddl];
+	if ( hg->_checker_dim == d ) {
+	  hcoor[d]=hcoor[d]*2;     // factor in the full gridd coor for peekLocalSite
+	  lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
+	}
+	ddl++;
      }
    }
    peekLocalSite(s,higherDimv,hcoor);
@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
 *
 */

-template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
+template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
 					      Lattice<vobj> &lat,
 					      int x,
 					      int dim,
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
  });
 }

-template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
+template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
 					     const Lattice<vobj> &lat,
 					     int x,
 					     int dim,
@@ -462,13 +462,19 @@ public:
    int rNsimd = Nsimd / simd[dimension];
    assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);

-    static cshiftVector<vobj> send_buf; 
-    static cshiftVector<vobj> recv_buf;
+    static deviceVector<vobj> send_buf; 
+    static deviceVector<vobj> recv_buf;
    send_buf.resize(buffer_size*2*depth);    
    recv_buf.resize(buffer_size*2*depth);
+#ifndef ACCELERATOR_AWARE_MPI
+    static hostVector<vobj> hsend_buf; 
+    static hostVector<vobj> hrecv_buf;
+    hsend_buf.resize(buffer_size*2*depth);    
+    hrecv_buf.resize(buffer_size*2*depth);
+#endif    

-    std::vector<CommsRequest_t> fwd_req;   
-    std::vector<CommsRequest_t> bwd_req;   
+    std::vector<MpiCommsRequest_t> fwd_req;   
+    std::vector<MpiCommsRequest_t> bwd_req;   

    int words = buffer_size;
    int bytes = words * sizeof(vobj);
@@ -495,9 +501,16 @@ public:
      t_gather+=usecond()-t;

      t=usecond();
+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(fwd_req,
 				(void *)&send_buf[d*buffer_size], xmit_to_rank,
 				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+#else
+      acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
+      grid->SendToRecvFromBegin(fwd_req,
+				(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
+				(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+#endif
      t_comms+=usecond()-t;
     }
    for ( int d=0;d < depth ; d ++ ) {
@@ -508,9 +521,16 @@ public:
      t_gather+= usecond() - t;

      t=usecond();
+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(bwd_req,
 				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
 				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+#else
+      acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
+      grid->SendToRecvFromBegin(bwd_req,
+				(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
+				(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+#endif      
      t_comms+=usecond()-t;
    }

@@ -533,8 +553,13 @@ public:

    t=usecond();
    grid->CommsComplete(fwd_req);
+#ifndef ACCELERATOR_AWARE_MPI
+    for ( int d=0;d < depth ; d ++ ) {
+      acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
+    }
+#endif
    t_comms+= usecond() - t;
-
+    
    t=usecond();
    for ( int d=0;d < depth ; d ++ ) {
      ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@@ -543,6 +568,11 @@ public:

    t=usecond();
    grid->CommsComplete(bwd_req);
+#ifndef ACCELERATOR_AWARE_MPI
+    for ( int d=0;d < depth ; d ++ ) {
+      acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
+    }
+#endif
    t_comms+= usecond() - t;
    
    t=usecond();
@@ -98,7 +98,7 @@ public:
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
-
+ 
  /////////////////////////////////////////////////////////////
  // virtual smeared interface through configuration container
  /////////////////////////////////////////////////////////////
@@ -132,6 +132,10 @@ public:
 template <class GaugeField >
 class EmptyAction : public Action <GaugeField>
 {
+  using Action<GaugeField>::refresh;
+  using Action<GaugeField>::Sinitial;
+  using Action<GaugeField>::deriv;
+
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative
@@ -55,6 +55,11 @@ public:
  RealD alpha; // Mobius scale
  RealD k;     // EOFA normalization constant

+  // Device resident
+  deviceVector<Coeff_t> d_shift_coefficients;
+  deviceVector<Coeff_t> d_MooeeInv_shift_lc;
+  deviceVector<Coeff_t> d_MooeeInv_shift_norm;
+  
  virtual void Instantiatable(void) = 0;

  // EOFA-specific operations
@@ -92,6 +97,11 @@ public:
    this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
      ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
      ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
+    
+    d_shift_coefficients.resize(Ls);
+    d_MooeeInv_shift_lc.resize(Ls);
+    d_MooeeInv_shift_norm.resize(Ls);
+
  };
 };

@@ -90,16 +90,16 @@ public:
  void M5D(const FermionField &psi,
 	   const FermionField &phi,
 	   FermionField &chi,
-	   Vector<Coeff_t> &lower,
-	   Vector<Coeff_t> &diag,
-	   Vector<Coeff_t> &upper);
+	   std::vector<Coeff_t> &lower,
+	   std::vector<Coeff_t> &diag,
+	   std::vector<Coeff_t> &upper);

  void M5Ddag(const FermionField &psi,
 	      const FermionField &phi,
 	      FermionField &chi,
-	      Vector<Coeff_t> &lower,
-	      Vector<Coeff_t> &diag,
-	      Vector<Coeff_t> &upper);
+	      std::vector<Coeff_t> &lower,
+	      std::vector<Coeff_t> &diag,
+	      std::vector<Coeff_t> &upper);

  virtual void   Instantiatable(void)=0;

@@ -119,35 +119,51 @@ public:
  RealD mass_plus, mass_minus;

  // Save arguments to SetCoefficientsInternal
-  Vector<Coeff_t> _gamma;
+  std::vector<Coeff_t> _gamma;
  RealD                _zolo_hi;
  RealD                _b;
  RealD                _c;

+  // possible boost
+  std::vector<ComplexD> qmu;
+  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
+  void addQmu(const FermionField &in, FermionField &out, int dag);
+  
  // Cayley form Moebius (tanh and zolotarev)
-  Vector<Coeff_t> omega;
-  Vector<Coeff_t> bs;    // S dependent coeffs
-  Vector<Coeff_t> cs;
-  Vector<Coeff_t> as;
+  std::vector<Coeff_t> omega;
+  std::vector<Coeff_t> bs;    // S dependent coeffs
+  std::vector<Coeff_t> cs;
+  std::vector<Coeff_t> as;
  // For preconditioning Cayley form
-  Vector<Coeff_t> bee;
-  Vector<Coeff_t> cee;
-  Vector<Coeff_t> aee;
-  Vector<Coeff_t> beo;
-  Vector<Coeff_t> ceo;
-  Vector<Coeff_t> aeo;
+  std::vector<Coeff_t> bee;
+  std::vector<Coeff_t> cee;
+  std::vector<Coeff_t> aee;
+  std::vector<Coeff_t> beo;
+  std::vector<Coeff_t> ceo;
+  std::vector<Coeff_t> aeo;
  // LDU factorisation of the eeoo matrix
-  Vector<Coeff_t> lee;
-  Vector<Coeff_t> leem;
-  Vector<Coeff_t> uee;
-  Vector<Coeff_t> ueem;
-  Vector<Coeff_t> dee;
+  std::vector<Coeff_t> lee;
+  std::vector<Coeff_t> leem;
+  std::vector<Coeff_t> uee;
+  std::vector<Coeff_t> ueem;
+  std::vector<Coeff_t> dee;
+
+  // Device memory
+  deviceVector<Coeff_t> d_diag;
+  deviceVector<Coeff_t> d_upper;
+  deviceVector<Coeff_t> d_lower;
+
+  deviceVector<Coeff_t> d_lee;
+  deviceVector<Coeff_t> d_dee;
+  deviceVector<Coeff_t> d_uee;
+  deviceVector<Coeff_t> d_leem;
+  deviceVector<Coeff_t> d_ueem;

  // Matrices of 5d ee inverse params
-  Vector<iSinglet<Simd> >  MatpInv;
-  Vector<iSinglet<Simd> >  MatmInv;
-  Vector<iSinglet<Simd> >  MatpInvDag;
-  Vector<iSinglet<Simd> >  MatmInvDag;
+  //  std::vector<iSinglet<Simd> >  MatpInv;
+  //  std::vector<iSinglet<Simd> >  MatmInv;
+  //  std::vector<iSinglet<Simd> >  MatpInvDag;
+  //  std::vector<iSinglet<Simd> >  MatmInvDag;

  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
@@ -187,7 +203,7 @@ public:
 protected:
  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
-  virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
+  virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
 };

 NAMESPACE_END(Grid);
@@ -0,0 +1,196 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
+
+    Copyright (C) 2020 - 2025
+
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+    Author: Nils Meyer <nils.meyer@ur.de>
+    Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
+#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
+#include <Grid/qcd/action/fermion/CloverHelpers.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
+
+template<class Impl, class CloverHelpers>
+class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
+				     public WilsonCloverHelpers<Impl>,
+				     public CompactWilsonCloverHelpers<Impl> {
+  /////////////////////////////////////////////
+  // Sizes
+  /////////////////////////////////////////////
+
+public:
+
+  INHERIT_COMPACT_CLOVER_SIZES(Impl);
+
+  /////////////////////////////////////////////
+  // Type definitions
+  /////////////////////////////////////////////
+
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+  INHERIT_CLOVER_TYPES(Impl);
+  INHERIT_COMPACT_CLOVER_TYPES(Impl);
+
+  typedef WilsonFermion5D<Impl>            WilsonBase;
+  typedef WilsonCloverHelpers<Impl>        Helpers;
+  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
+
+  /////////////////////////////////////////////
+  // Constructors
+  /////////////////////////////////////////////
+
+public:
+
+  CompactWilsonCloverFermion5D(GaugeField& _Umu,
+			       GridCartesian         &FiveDimGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridCartesian         &FourDimGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
+			       const RealD _mass,
+			       const RealD _csw_r = 0.0,
+			       const RealD _csw_t = 0.0,
+			       const RealD _cF = 1.0,
+			       const ImplParams& impl_p = ImplParams());
+
+  /////////////////////////////////////////////
+  // Member functions (implementing interface)
+  /////////////////////////////////////////////
+
+public:
+
+  virtual void Instantiatable() {};
+  int          ConstEE()     override { return 0; };
+  int          isTrivialEE() override { return 0; };
+
+  void Dhop(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
+
+  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
+
+  void M(const FermionField& in, FermionField& out) override;
+
+  void Mdag(const FermionField& in, FermionField& out) override;
+
+  void Meooe(const FermionField& in, FermionField& out) override;
+
+  void MeooeDag(const FermionField& in, FermionField& out) override;
+
+  void Mooee(const FermionField& in, FermionField& out) override;
+
+  void MooeeDag(const FermionField& in, FermionField& out) override;
+
+  void MooeeInv(const FermionField& in, FermionField& out) override;
+
+  void MooeeInvDag(const FermionField& in, FermionField& out) override;
+
+  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
+
+  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
+
+  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
+
+  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
+
+  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
+
+  /////////////////////////////////////////////
+  // Member functions (internals)
+  /////////////////////////////////////////////
+
+  void MooeeInternal(const FermionField&        in,
+                     FermionField&              out,
+                     const CloverDiagonalField& diagonal,
+                     const CloverTriangleField& triangle);
+
+  /////////////////////////////////////////////
+  // Helpers
+  /////////////////////////////////////////////
+
+  void ImportGauge(const GaugeField& _Umu) override;
+
+  /////////////////////////////////////////////
+  // Helpers
+  /////////////////////////////////////////////
+
+private:
+
+  template<class Field>
+  const MaskField* getCorrectMaskField(const Field &in) const {
+    if(in.Grid()->_isCheckerBoarded) {
+      if(in.Checkerboard() == Odd) {
+        return &this->BoundaryMaskOdd;
+      } else {
+        return &this->BoundaryMaskEven;
+      }
+    } else {
+      return &this->BoundaryMask;
+    }
+  }
+
+  template<class Field>
+  void ApplyBoundaryMask(Field& f) {
+    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
+    assert(m != nullptr);
+    CompactHelpers::ApplyBoundaryMask(f, *m);
+  }
+
+  /////////////////////////////////////////////
+  // Member Data
+  /////////////////////////////////////////////
+
+public:
+
+  RealD csw_r;
+  RealD csw_t;
+  RealD cF;
+  int n_rhs;
+  
+  bool fixedBoundaries;
+
+  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
+  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
+
+  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
+  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
+
+  FermionField Tmp;
+
+  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
+};
+
+NAMESPACE_END(Grid);
@@ -60,6 +60,50 @@ public:
  //      virtual void   Instantiatable(void)=0;
  virtual void   Instantiatable(void) =0;

+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  {
+    std::cout << "Free Propagator for PartialFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    
+    FFT theFFT((GridCartesian *) in.Grid());
+
+    //phase for boundary condition
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// Shift coordinate lattice index by 1 to account for 5th dimension.
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu]));
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    
+    //phase for boundary condition
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
+    std::vector<Complex> boundary;
+    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
+    FreePropagator(in,out,mass,boundary,twist);
+  };
+
+  
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
@@ -90,12 +134,12 @@ protected:
  RealD mass;
  RealD R;
  RealD ZoloHiInv;
-  Vector<double> Beta;
-  Vector<double> cc;;
-  Vector<double> cc_d;;
-  Vector<double> sqrt_cc;
-  Vector<double> See;
-  Vector<double> Aee;
+  std::vector<double> Beta;
+  std::vector<double> cc;;
+  std::vector<double> cc_d;;
+  std::vector<double> sqrt_cc;
+  std::vector<double> See;
+  std::vector<double> Aee;

 };

@@ -69,10 +69,10 @@ public:
  // Instantiate different versions depending on Impl
  /////////////////////////////////////////////////////
  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	   std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

  virtual void RefreshShiftCoefficients(RealD new_shift);

@@ -83,7 +83,7 @@ public:
 			RealD _M5, const ImplParams& p=ImplParams());

 protected:
-  void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
+  void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
 };

 NAMESPACE_END(Grid);
@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS

 // Compact Clover fermions
 template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
+template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
 template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;

 typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
 typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;

+typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
+typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
+typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
+
 typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
 typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
 typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
@@ -102,11 +102,11 @@ public:
 		     GaugeField &mat, 
 		     const FermionField &A, const FermionField &B, int dag);

-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);

  //////////////////////////////////////////////////////////////////////////
@@ -164,8 +164,6 @@ public:
  DoubledGaugeField UUUmuEven;
  DoubledGaugeField UUUmuOdd;

-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
  
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
@@ -100,7 +100,6 @@ public:
 		     int dag);
    
  void DhopInternal(StencilImpl & st,
-		    LebesgueOrder &lo,
 		    DoubledGaugeField &U,
 		    DoubledGaugeField &UUU,
 		    const FermionField &in, 
@@ -108,7 +107,6 @@ public:
 		    int dag);
    
    void DhopInternalOverlappedComms(StencilImpl & st,
-		      LebesgueOrder &lo,
 		      DoubledGaugeField &U,
 		      DoubledGaugeField &UUU,
 		      const FermionField &in, 
@@ -116,7 +114,6 @@ public:
 		      int dag);

    void DhopInternalSerialComms(StencilImpl & st,
-		      LebesgueOrder &lo,
 		      DoubledGaugeField &U,
 		      DoubledGaugeField &UUU,
 		      const FermionField &in, 
@@ -192,8 +189,6 @@ public:
  DoubledGaugeField UUUmuEven;
  DoubledGaugeField UUUmuOdd;
    
-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
    
  // Comms buffer
  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
@@ -42,11 +42,11 @@ public:

 public:
  // Shift operator coefficients for red-black preconditioned Mobius EOFA
-  Vector<Coeff_t> Mooee_shift;
-  Vector<Coeff_t> MooeeInv_shift_lc;
-  Vector<Coeff_t> MooeeInv_shift_norm;
-  Vector<Coeff_t> MooeeInvDag_shift_lc;
-  Vector<Coeff_t> MooeeInvDag_shift_norm;
+  std::vector<Coeff_t> Mooee_shift;
+  std::vector<Coeff_t> MooeeInv_shift_lc;
+  std::vector<Coeff_t> MooeeInv_shift_norm;
+  std::vector<Coeff_t> MooeeInvDag_shift_lc;
+  std::vector<Coeff_t> MooeeInvDag_shift_norm;

  virtual void Instantiatable(void) {};

@@ -74,18 +74,18 @@ public:
  // Instantiate different versions depending on Impl
  /////////////////////////////////////////////////////
  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	   std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

  void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-		 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-		 Vector<Coeff_t>& shift_coeffs);
+		 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+		 std::vector<Coeff_t>& shift_coeffs);

  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

  void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-		    Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-		    Vector<Coeff_t>& shift_coeffs);
+		    std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+		    std::vector<Coeff_t>& shift_coeffs);

  virtual void RefreshShiftCoefficients(RealD new_shift);

@@ -102,11 +102,11 @@ public:
 		     GaugeField &mat, 
 		     const FermionField &A, const FermionField &B, int dag);

-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
 			       const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
 				   const FermionField &in, FermionField &out, int dag);

  //////////////////////////////////////////////////////////////////////////
@@ -152,9 +152,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;

-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
-  
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
@@ -42,7 +42,7 @@ public:

     void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
       this->MomentumSpacePropagatorHw(out,in,_m,twist);
-  };
+     };

  // Constructors
  OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
@@ -41,6 +41,10 @@ public:
 public:

  // Constructors
+  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };

  OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
 				      GridCartesian         &FiveDimGrid,
@@ -41,6 +41,9 @@ public:
 public:

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
  // Constructors
  OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
 				   GridCartesian         &FiveDimGrid,
@@ -40,6 +40,9 @@ public:
  INHERIT_IMPL_TYPES(Impl);

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
  // Constructors
  OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
 					GridCartesian         &FiveDimGrid,
@@ -41,6 +41,9 @@ public:
 public:

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
  // Constructors
  OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
 					  GridCartesian         &FiveDimGrid,
@@ -40,6 +40,11 @@ public:
  INHERIT_IMPL_TYPES(Impl);

  virtual void   Instantiatable(void){};
+
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
+
  // Constructors
  OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
 					       GridCartesian         &FiveDimGrid,
@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
 public:
  INHERIT_IMPL_TYPES(Impl);

-  const int part_frac_chroma_convention=1;
+  const int part_frac_chroma_convention=0;

  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
@@ -83,19 +83,78 @@ public:
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());

+  PartialFractionFermion5D(GaugeField &_Umu,
+			   GridCartesian         &FiveDimGrid,
+			   GridRedBlackCartesian &FiveDimRedBlackGrid,
+			   GridCartesian         &FourDimGrid,
+			   GridRedBlackCartesian &FourDimRedBlackGrid,
+			   RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
+
+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  {
+    std::cout << "Free Propagator for PartialFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    
+    FFT theFFT((GridCartesian *) in.Grid());
+
+    //phase for boundary condition
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// Shift coordinate lattice index by 1 to account for 5th dimension.
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu]));
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    if ( this->qmu.size() ){
+      this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
+    } else {
+      this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    }
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    
+    //phase for boundary condition
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
+    std::vector<Complex> boundary;
+    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
+    FreePropagator(in,out,mass,boundary,twist);
+  };
+
+  void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
+  void addQmu(const FermionField &in, FermionField &out, int dag);
+
 protected:

  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);

+  std::vector<RealD> qmu;
+
  // Part frac
  RealD mass;
  RealD dw_diag;
  RealD R;
  RealD amax;
  RealD scale;
-  Vector<double> p; 
-  Vector<double> q;
+  std::vector<double> p; 
+  std::vector<double> q;

 };

@@ -35,7 +35,7 @@ template<class Matrix, class Field>
 class KappaSimilarityTransform {
 public:
  INHERIT_IMPL_TYPES(Matrix);
-  Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
+  std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;

  KappaSimilarityTransform (Matrix &zmob) {
    for (int i=0;i<(int)zmob.bs.size();i++) {
@@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   
 public:

-  void DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
+  void DhopImproved(StencilImpl &st,
 		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
 		    const FermionField &in, FermionField &out, int dag, int interior,int exterior);
-  void DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
+  void DhopNaive(StencilImpl &st,
 		 DoubledGaugeField &U,
 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
  
@@ -47,7 +47,7 @@ public:
  static int PartialCompressionFactor(GridBase *grid) { return 1;}
 #endif
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
 				   const Lattice<vobj> &rhs,
 				   cobj *buffer,
 				   compressor &compress,
@@ -109,7 +109,7 @@ public:
  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
@@ -197,7 +197,7 @@ public:
 #endif
  
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
 					 const Lattice<vobj> &rhs,
 					 cobj *buffer,
 					 compressor &compress,
@@ -208,7 +208,7 @@ public:
    else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
  }
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
@@ -402,7 +402,6 @@ public:

  typedef CartesianStencil<vobj,cobj,Parameters> Base;
  typedef typename Base::View_type View_type;
-  typedef typename Base::StencilVector StencilVector;

  //  Vector<int> surface_list;
  WilsonStencil(GridBase *grid,
@@ -415,29 +414,6 @@ public:
    //    surface_list.resize(0);
    this->same_node.resize(npoints);
  };
-
-  /*
-  void BuildSurfaceList(int Ls,int vol4){
-
-    // find same node for SHM
-    // Here we know the distance is 1 for WilsonStencil
-    for(int point=0;point<this->_npoints;point++){
-      this->same_node[point] = this->SameNode(point);
-    }
-    
-    for(int site = 0 ;site< vol4;site++){
-      int local = 1;
-      for(int point=0;point<this->_npoints;point++){
-	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ 
-	  local = 0;
-	}
-      }
-      if(local == 0) { 
-	surface_list.push_back(site);
-      }
-    }
-  }
-  */
  
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
@@ -508,6 +484,11 @@ public:
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    accelerator_barrier();
+#ifdef NVLINK_GET
+    this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
+    // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
+    // Or issue barrier AFTER the DMA is running
+#endif    
  }

 };
@@ -126,14 +126,17 @@ public:
  void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
                     const FermionField &A, const FermionField &B, int dag);

-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternal(StencilImpl &st,
+		    DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);

-  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-                    const FermionField &in, FermionField &out, int dag);
+  void DhopInternalSerial(StencilImpl &st,
+			  DoubledGaugeField &U,
+			  const FermionField &in, FermionField &out, int dag);

-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-                    const FermionField &in, FermionField &out, int dag);
+  void DhopInternalOverlappedComms(StencilImpl &st,
+				   DoubledGaugeField &U,
+				   const FermionField &in, FermionField &out, int dag);

  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
@@ -168,9 +171,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;

-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
-
  WilsonAnisotropyCoefficients anisotropyCoeff;

  ///////////////////////////////////////////////////////////////
@@ -91,13 +91,13 @@ public:
  virtual void   Mdag (const FermionField &in, FermionField &out){assert(0);};

  // half checkerboard operations; leave unimplemented as abstract for now
-  virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
-  virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
-  virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);

-  virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
-  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
-  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
  virtual void   MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

@@ -109,6 +109,8 @@ public:
  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
+				  std::vector<double> qmu) ;

  // Implement hopping term non-hermitian hopping term; half cb or both
  // Implement s-diagonal DW
@@ -117,6 +119,9 @@ public:
  void DhopOE(const FermionField &in, FermionField &out,int dag);
  void DhopEO(const FermionField &in, FermionField &out,int dag);

+  void DhopComms  (const FermionField &in, FermionField &out);
+  void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids);
+  
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
@@ -135,21 +140,18 @@ public:
 		     int dag);
    
  void DhopInternal(StencilImpl & st,
-		    LebesgueOrder &lo,
 		    DoubledGaugeField &U,
 		    const FermionField &in, 
 		    FermionField &out,
 		    int dag);

  void DhopInternalOverlappedComms(StencilImpl & st,
-				   LebesgueOrder &lo,
 				   DoubledGaugeField &U,
 				   const FermionField &in, 
 				   FermionField &out,
 				   int dag);

  void DhopInternalSerialComms(StencilImpl & st,
-			       LebesgueOrder &lo,
 			       DoubledGaugeField &U,
 			       const FermionField &in, 
 			       FermionField &out,
@@ -203,9 +205,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
    
-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
-    
  // Comms buffer
  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;

@@ -57,6 +57,10 @@ public:
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 int interior=1,int exterior=1) ;

+  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 uint64_t *ids);
+  
  static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out,
 			    int interior=1,int exterior=1) ;
@@ -58,7 +58,7 @@ public:
  {
    //    RealD eps = 1.0;
    std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
-    Vector<Coeff_t> zgamma(this->Ls);
+    std::vector<Coeff_t> zgamma(this->Ls);
    for(int s=0;s<this->Ls;s++){
      zgamma[s] = gamma[s];
    }
@@ -1,3 +1,5 @@
+#if 0
+
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -818,3 +820,5 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
 }

 NAMESPACE_END(Grid);
+
+#endif
@@ -1,3 +1,4 @@
+#if 0
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void)
 }
 NAMESPACE_END(Grid);

+#endif
@@ -72,7 +72,7 @@ public:
  void ThreadInterleave(void);

 private:
-  Vector<IndexInteger> _LebesgueReorder;
+  deviceVector<IndexInteger> _LebesgueReorder;

 };    

@@ -48,7 +48,8 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
  mass_plus(_mass), mass_minus(_mass)
-{ 
+{
+  // qmu defaults to zero size;
 }

 ///////////////////////////////////////////////////////////////
@@ -156,18 +157,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
+  std::vector<Coeff_t> diag (Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bs;
-  Vector<Coeff_t> upper= cs;
-  Vector<Coeff_t> lower= cs; 
+  std::vector<Coeff_t> diag = bs;
+  std::vector<Coeff_t> upper= cs;
+  std::vector<Coeff_t> lower= cs; 
  upper[Ls-1]=-mass_minus*upper[Ls-1];
  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
@@ -176,9 +177,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = beo;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = beo;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
@@ -191,9 +192,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-cee[i];
    lower[i]=-cee[i];
@@ -206,9 +207,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);

  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
@@ -236,9 +237,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
+  std::vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass_plus*upper[Ls-1];
  lower[0]   =-mass_minus*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
@@ -248,9 +249,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag =bs;
-  Vector<Coeff_t> upper=cs;
-  Vector<Coeff_t> lower=cs; 
+  std::vector<Coeff_t> diag =bs;
+  std::vector<Coeff_t> upper=cs;
+  std::vector<Coeff_t> lower=cs; 

  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
@@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  M5Ddag(psi,psi,Din,lower,diag,upper);
 }

+template<class Impl>
+void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
+{
+  if ( qmu.size() ) {
+
+    Gamma::Algebra Gmu [] = {
+      Gamma::Algebra::GammaX,
+      Gamma::Algebra::GammaY,
+      Gamma::Algebra::GammaZ,
+      Gamma::Algebra::GammaT
+    };
+    std::vector<ComplexD> coeff(Nd);
+    ComplexD ci(0,1);
+
+    assert(qmu.size()==Nd);
+
+    for(int mu=0;mu<Nd;mu++){
+       coeff[mu] = ci*qmu[mu];
+       if ( dag ) coeff[mu] = conjugate(coeff[mu]);
+    }
+
+    chi = chi + Gamma(Gmu[0])*psi*coeff[0];
+    for(int mu=1;mu<Nd;mu++){
+      chi = chi + Gamma(Gmu[mu])*psi*coeff[mu];
+    }
+  }
+}
+
 template<class Impl>
 void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
@@ -277,8 +306,12 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  
  // Assemble Din
  Meooe5D(psi,Din);
-  
+
  this->DW(Din,chi,DaggerNo);
+
+  // add i q_mu gamma_mu here
+  addQmu(Din,chi,DaggerNo);
+  
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
  
@@ -295,6 +328,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  FermionField Din(psi.Grid());
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
+
+  // add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not.
+  addQmu(psi,Din,DaggerYes);
  
  MeooeDag5D(Din,chi);
  
@@ -394,7 +430,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  Vector<Coeff_t> gamma(this->Ls);
+  std::vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(1.0,gamma,b,c);
 }
@@ -402,13 +438,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  Vector<Coeff_t> gamma(this->Ls);
+  std::vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(zolo_hi,gamma,b,c);
 }
 //Zolo
 template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;

@@ -488,7 +524,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  leem.resize(Ls);
  uee.resize(Ls);
  ueem.resize(Ls);
-  
+
  for(int i=0;i<Ls;i++){
    
    dee[i] = bee[i];
@@ -529,6 +565,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
    dee[Ls-1] += delta_d;
  }  

+  //////////////////////////////////////////
+  // Device buffers
+  //////////////////////////////////////////
+  d_diag.resize(Ls);
+  d_upper.resize(Ls);
+  d_lower.resize(Ls);
+
+  d_dee.resize(Ls);
+  d_lee.resize(Ls);
+  d_uee.resize(Ls);
+  d_leem.resize(Ls);
+  d_ueem.resize(Ls);
  //  int inv=1;
  //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
@@ -43,9 +43,9 @@ void
 CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
 			       const FermionField &phi_i, 
 			       FermionField &chi_i,
-			       Vector<Coeff_t> &lower,
-			       Vector<Coeff_t> &diag,
-			       Vector<Coeff_t> &upper)
+			       std::vector<Coeff_t> &lower,
+			       std::vector<Coeff_t> &diag,
+			       std::vector<Coeff_t> &upper)
 {
  
  chi_i.Checkerboard()=psi_i.Checkerboard();
@@ -55,12 +55,16 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  autoView(chi , chi_i,AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
-
  int Ls =this->Ls;

+  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
+  
+  auto pdiag = &d_diag[0];
+  auto pupper = &d_upper[0];
+  auto plower = &d_lower[0];
+
  // 10 = 3 complex mult + 2 complex add
  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
  uint64_t nloop = grid->oSites();
@@ -82,9 +86,9 @@ void
 CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
 			      const FermionField &phi_i, 
 			      FermionField &chi_i,
-			      Vector<Coeff_t> &lower,
-			      Vector<Coeff_t> &diag,
-			      Vector<Coeff_t> &upper)
+			      std::vector<Coeff_t> &lower,
+			      std::vector<Coeff_t> &diag,
+			      std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
@@ -93,12 +97,16 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  autoView(chi , chi_i,AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
-
  int Ls=this->Ls;

+  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
+  
+  auto pdiag = &d_diag[0];
+  auto pupper = &d_upper[0];
+  auto plower = &d_lower[0];
+
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -126,11 +134,17 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi

  int Ls=this->Ls;

-  auto plee  = & lee [0];
-  auto pdee  = & dee [0];
-  auto puee  = & uee [0];
-  auto pleem = & leem[0];
-  auto pueem = & ueem[0];
+  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
+
+  auto plee  = & d_lee [0];
+  auto pdee  = & d_dee [0];
+  auto puee  = & d_uee [0];
+  auto pleem = & d_leem[0];
+  auto pueem = & d_ueem[0];

  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -182,11 +196,17 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
  autoView(psi , psi_i,AcceleratorRead);
  autoView(chi , chi_i,AcceleratorWrite);

-  auto plee  = & lee [0];
-  auto pdee  = & dee [0];
-  auto puee  = & uee [0];
-  auto pleem = & leem[0];
-  auto pueem = & ueem[0];
+  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
+
+  auto plee  = & d_lee [0];
+  auto pdee  = & d_dee [0];
+  auto puee  = & d_uee [0];
+  auto pleem = & d_leem[0];
+  auto pueem = & d_ueem[0];

  assert(psi.Checkerboard() == psi.Checkerboard());

@@ -0,0 +1,376 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
+
+    Copyright (C) 2017 - 2025
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+    Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
+
+
+NAMESPACE_BEGIN(Grid);
+template<class Impl, class CloverHelpers>
+CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
+										GridCartesian         &FiveDimGrid,
+										GridRedBlackCartesian &FiveDimRedBlackGrid,
+										GridCartesian         &FourDimGrid,
+										GridRedBlackCartesian &FourDimRedBlackGrid,
+										const RealD _mass,
+										const RealD _csw_r,
+										const RealD _csw_t,
+										const RealD _cF,
+										const ImplParams& impl_p)
+  : WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
+  , csw_r(_csw_r)
+  , csw_t(_csw_t)
+  , cF(_cF)
+  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
+  , Diagonal(&FourDimGrid),        Triangle(&FourDimGrid)
+  , DiagonalEven(&FourDimRedBlackGrid),    TriangleEven(&FourDimRedBlackGrid)
+  , DiagonalOdd(&FourDimRedBlackGrid),     TriangleOdd(&FourDimRedBlackGrid)
+  , DiagonalInv(&FourDimGrid),     TriangleInv(&FourDimGrid)
+  , DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
+  , DiagonalInvOdd(&FourDimRedBlackGrid),  TriangleInvOdd(&FourDimRedBlackGrid)
+  , Tmp(&FiveDimGrid)
+  , BoundaryMask(&FiveDimGrid)
+  , BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
+{
+  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
+
+  csw_r *= 0.5;
+  csw_t *= 0.5;
+  //if (clover_anisotropy.isAnisotropic)
+  //  csw_r /= clover_anisotropy.xi_0;
+
+  ImportGauge(_Umu);
+  if (fixedBoundaries) {
+    this->BoundaryMaskEven.Checkerboard() = Even;
+    this->BoundaryMaskOdd.Checkerboard() = Odd;
+    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
+  }
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::Dhop(in, out, dag);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::DhopOE(in, out, dag);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::DhopEO(in, out, dag);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
+  WilsonBase::DhopDir(in, out, dir, disp);
+  if(this->fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
+  WilsonBase::DhopDirAll(in, out);
+  if(this->fixedBoundaries) {
+    for(auto& o : out) ApplyBoundaryMask(o);
+  }
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
+  out.Checkerboard() = in.Checkerboard();
+  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
+  Mooee(in, Tmp);
+  axpy(out, 1.0, out, Tmp);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
+  out.Checkerboard() = in.Checkerboard();
+  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
+  MooeeDag(in, Tmp);
+  axpy(out, 1.0, out, Tmp);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
+  WilsonBase::Meooe(in, out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
+  WilsonBase::MeooeDag(in, out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
+  if(in.Grid()->_isCheckerBoarded) {
+    if(in.Checkerboard() == Odd) {
+      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
+    } else {
+      MooeeInternal(in, out, DiagonalEven, TriangleEven);
+    }
+  } else {
+    MooeeInternal(in, out, Diagonal, Triangle);
+  }
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
+  Mooee(in, out); // blocks are hermitian
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
+  if(in.Grid()->_isCheckerBoarded) {
+    if(in.Checkerboard() == Odd) {
+      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
+    } else {
+      MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
+    }
+  } else {
+    MooeeInternal(in, out, DiagonalInv, TriangleInv);
+  }
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
+  MooeeInv(in, out); // blocks are hermitian
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
+  DhopDir(in, out, dir, disp);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
+  DhopDirAll(in, out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
+  assert(!fixedBoundaries); // TODO check for changes required for open bc
+
+  // NOTE: code copied from original clover term
+  conformable(X.Grid(), Y.Grid());
+  conformable(X.Grid(), force.Grid());
+  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+  GaugeField clover_force(force.Grid());
+  PropagatorField Lambda(force.Grid());
+
+  // Guido: Here we are hitting some performance issues:
+  // need to extract the components of the DoubledGaugeField
+  // for each call
+  // Possible solution
+  // Create a vector object to store them? (cons: wasting space)
+  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
+
+  Impl::extractLinkField(U, this->Umu);
+
+  force = Zero();
+  // Derivative of the Wilson hopping term
+  this->DhopDeriv(force, X, Y, dag);
+
+  ///////////////////////////////////////////////////////////
+  // Clover term derivative
+  ///////////////////////////////////////////////////////////
+  Impl::outerProductImpl(Lambda, X, Y);
+  //std::cout << "Lambda:" << Lambda << std::endl;
+
+  Gamma::Algebra sigma[] = {
+      Gamma::Algebra::SigmaXY,
+      Gamma::Algebra::SigmaXZ,
+      Gamma::Algebra::SigmaXT,
+      Gamma::Algebra::MinusSigmaXY,
+      Gamma::Algebra::SigmaYZ,
+      Gamma::Algebra::SigmaYT,
+      Gamma::Algebra::MinusSigmaXZ,
+      Gamma::Algebra::MinusSigmaYZ,
+      Gamma::Algebra::SigmaZT,
+      Gamma::Algebra::MinusSigmaXT,
+      Gamma::Algebra::MinusSigmaYT,
+      Gamma::Algebra::MinusSigmaZT};
+
+  /*
+    sigma_{\mu \nu}=
+    | 0         sigma[0]  sigma[1]  sigma[2] |
+    | sigma[3]    0       sigma[4]  sigma[5] |
+    | sigma[6]  sigma[7]     0      sigma[8] |
+    | sigma[9]  sigma[10] sigma[11]   0      |
+  */
+
+  int count = 0;
+  clover_force = Zero();
+  for (int mu = 0; mu < 4; mu++)
+  {
+    force_mu = Zero();
+    for (int nu = 0; nu < 4; nu++)
+    {
+      if (mu == nu)
+        continue;
+
+      RealD factor;
+      if (nu == 4 || mu == 4)
+      {
+        factor = 2.0 * csw_t;
+      }
+      else
+      {
+        factor = 2.0 * csw_r;
+      }
+      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
+      count++;
+    }
+
+    pokeLorentz(clover_force, U[mu] * force_mu, mu);
+  }
+  //clover_force *= csw;
+  force += clover_force;
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+  assert(0);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+  assert(0);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
+								      FermionField&              out,
+								      const CloverDiagonalField& diagonal,
+								      const CloverTriangleField& triangle) {
+  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+  out.Checkerboard() = in.Checkerboard();
+  conformable(in, out);
+  CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
+  // NOTE: parts copied from original implementation
+
+  // Import gauge into base class
+  double t0 = usecond();
+  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
+
+  // Initialize temporary variables
+  double t1 = usecond();
+  conformable(_Umu.Grid(), this->GaugeGrid());
+  GridBase* grid = _Umu.Grid();
+  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
+  CloverField TmpOriginal(grid);
+  CloverField TmpInverse(grid);
+
+  // Compute the field strength terms mu>nu
+  double t2 = usecond();
+  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+  // Compute the Clover Operator acting on Colour and Spin
+  // multiply here by the clover coefficients for the anisotropy
+  double t3 = usecond();
+  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
+  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
+  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
+  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
+  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
+  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
+
+  // Instantiate the clover term
+  // - In case of the standard clover the mass term is added
+  // - In case of the exponential clover the clover term is exponentiated
+  double t4 = usecond();
+  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
+
+  // Convert the data layout of the clover term
+  double t5 = usecond();
+  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
+
+  // Modify the clover term at the temporal boundaries in case of open boundary conditions
+  double t6 = usecond();
+  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
+
+  // Invert the Clover term
+  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
+  // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
+  // TODO: For now this inversion is explictly done on the CPU
+  double t7 = usecond();
+  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
+
+  // Fill the remaining clover fields
+  double t8 = usecond();
+  pickCheckerboard(Even, DiagonalEven,    Diagonal);
+  pickCheckerboard(Even, TriangleEven,    Triangle);
+  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
+  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
+  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
+  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
+  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
+  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
+
+  // Report timings
+  double t9 = usecond();
+
+  std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
+  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
+}
+
+NAMESPACE_END(Grid);
@@ -42,13 +42,13 @@ template<class Impl>
 void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
 {
  // How to check Ls matches??
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
  int Ls = this->Ls;
+  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
  assert(zdata->db==Ls);// Beta has Ls coeffs

  R=(1+this->mass)/(1-this->mass);
@@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
@@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
 // Pplus  backwards..
 template<class Impl>
 void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+				      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  int Ls = this->Ls;
@@ -50,9 +50,15 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  autoView( psi , psi_i, AcceleratorRead);
  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
+
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  
  auto nloop=grid->oSites()/Ls;
@@ -73,7 +79,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi

 template<class Impl>
 void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+					 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase* grid = psi_i.Grid();
@@ -83,9 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  autoView( phi , phi_i, AcceleratorRead);
  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
+  
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol

@@ -114,12 +125,17 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  autoView( chi, chi_i, AcceleratorWrite);
  int Ls = this->Ls;

-  auto plee  = & this->lee[0];
-  auto pdee  = & this->dee[0];
-  auto puee  = & this->uee[0];
-
-  auto pleem = & this->leem[0];
-  auto pueem = & this->ueem[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
+  
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));

  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
    else{ shiftm = -shift*(mq3-mq2); }
  }

-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;

 #if(0)
  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
@@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
    else{ shiftm = -shift*(mq3-mq2); }
  }

-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;

  this->M5Ddag(psi, chi, chi, lower, diag, upper);
 }
@@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
 {
  int Ls = this->Ls;

-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);

  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
@@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
 {
  int Ls = this->Ls;

-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);

  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
@@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField

 //Zolo
 template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
+void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
 {
  int   Ls    = this->Ls;
  int   pm    = this->pm;
@@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
  UUUmu(&FourDimGrid),
  UUUmuEven(&FourDimRedBlackGrid),
  UUUmuOdd(&FourDimRedBlackGrid),
-  Lebesgue(&FourDimGrid),
-  LebesgueEvenOdd(&FourDimRedBlackGrid),
  _tmp(&FiveDimRedBlackGrid)
 {

@@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,

 /*CHANGE */
 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, 
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalSerialComms(st,U,UUU,in,out,dag);
 }

 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, 
 								   DoubledGaugeField & U,DoubledGaugeField & UUU,
 								   const FermionField &in, FermionField &out,int dag)
 {
@@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }

  st.CommsMerge(compressor);
@@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }

 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, 
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
@@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }
 /*CHANGE END*/
@@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
  assert(in.Checkerboard()==Even);
  out.Checkerboard() = Odd;

-  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
+  DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
  assert(in.Checkerboard()==Odd);
  out.Checkerboard() = Even;

-  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
+  DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
@@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField

  out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
+  DhopInternal(Stencil,Umu,UUUmu,in,out,dag);
 }

 /////////////////////////////////////////////////////////////////////////
@@ -48,8 +48,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
@@ -339,7 +337,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &

  out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
+  DhopInternal(Stencil, Umu, UUUmu, in, out, dag);
 }

 template <class Impl>
@@ -351,7 +349,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;

-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
+  DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
 }

 template <class Impl>
@@ -363,7 +361,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;

-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
+  DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
 }

 template <class Impl>
@@ -394,19 +392,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel


 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, 
 						  DoubledGaugeField &U,
 						  DoubledGaugeField &UUU,
 						  const FermionField &in,
 						  FermionField &out, int dag) 
 {
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalSerialComms(st,U,UUU,in,out,dag);
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, 
 								 DoubledGaugeField &U,
 								 DoubledGaugeField &UUU,
 								 const FermionField &in,
@@ -429,7 +427,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }

  st.CommunicateComplete(requests);
@@ -440,13 +438,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }


 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, 
 							     DoubledGaugeField &U,
 							     DoubledGaugeField &UUU,
 							     const FermionField &in,
@@ -460,7 +458,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 };

@@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
 
 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+				  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@@ -50,10 +50,14 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField

  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];

+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -74,8 +78,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
-					Vector<Coeff_t> &shift_coeffs)
+					std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
+					std::vector<Coeff_t> &shift_coeffs)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@@ -86,13 +90,18 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion

  auto pm  = this->pm;
  int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-
+  
  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
-  auto pshift_coeffs = &shift_coeffs[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+  auto pshift_coeffs = &this->d_shift_coefficients[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@@ -119,7 +128,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+				     std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@@ -129,10 +138,14 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
  autoView(chi , chi_i, AcceleratorWrite);

  assert(phi.Checkerboard() == psi.Checkerboard());
+  
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@@ -154,8 +167,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
-					   Vector<Coeff_t> &shift_coeffs)
+					   std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
+					   std::vector<Coeff_t> &shift_coeffs)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@@ -167,11 +180,16 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm

  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
-  auto pshift_coeffs = &shift_coeffs[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+  auto pshift_coeffs = &this->d_shift_coefficients[0];

+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
+  
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  auto pm = this->pm;

@@ -212,11 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);

-  auto plee = & this->lee [0];
-  auto pdee = & this->dee [0];
-  auto puee = & this->uee [0];
-  auto pleem= & this->leem[0];
-  auto pueem= & this->ueem[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));

  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }

@@ -268,14 +292,23 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);

+  // Move into object and constructor
  auto pm = this->pm;
-  auto plee = & this->lee [0];
-  auto pdee = & this->dee [0];
-  auto puee = & this->uee [0];
-  auto pleem= & this->leem[0];
-  auto pueem= & this->ueem[0];
-  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
-  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
+  auto pMooeeInv_shift_lc   = &this->d_MooeeInv_shift_lc[0];
+  auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -333,11 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);

-  auto plee = & this->lee [0];
-  auto pdee = & this->dee [0];
-  auto puee = & this->uee [0];
-  auto pleem= & this->leem[0];
-  auto pueem= & this->ueem[0];
+  auto plee  = &this->d_lee [0];
+  auto pdee  = &this->d_dee [0];
+  auto puee  = &this->d_uee [0];
+  auto pleem = &this->d_leem[0];
+  auto pueem = &this->d_ueem[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -387,13 +426,25 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
  int Ls = this->Ls;

  auto pm = this->pm;
-  auto plee = & this->lee [0];
-  auto pdee = & this->dee [0];
-  auto puee = & this->uee [0];
-  auto pleem= & this->leem[0];
-  auto pueem= & this->ueem[0];
-  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
-  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
+
+  auto pMooeeInvDag_shift_lc   = &this->d_MooeeInv_shift_lc[0];
+  auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
+
+  //  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
+  //  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
 {
  int Ls = this->Ls;

-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-  Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
+  std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;

  // no shift term
  if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
@@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
 {
  int Ls = this->Ls;

-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-  Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
+  std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;

  // no shift term
  if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
@@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
  int Ls = this->Ls;

  // coefficients of Mooee
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
    lower[s] = -this->cee[s];
@@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
  int Ls = this->Ls;

  // coefficients of MooeeDag
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    if(s==0) {
      upper[s] = -this->cee[s+1];
@@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
  // Tridiagonal solve for MooeeInvDag_shift_lc
  {
    Coeff_t m(0.0);
-    Vector<Coeff_t> d = Mooee_shift;
-    Vector<Coeff_t> u(Ls,0.0);
-    Vector<Coeff_t> y(Ls,0.0);
-    Vector<Coeff_t> q(Ls,0.0);
+    std::vector<Coeff_t> d = Mooee_shift;
+    std::vector<Coeff_t> u(Ls,0.0);
+    std::vector<Coeff_t> y(Ls,0.0);
+    std::vector<Coeff_t> q(Ls,0.0);
    if(pm == 1){ u[0] = 1.0; }
    else{ u[Ls-1] = 1.0; }

@@ -48,8 +48,6 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
@@ -268,7 +266,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out

  out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
+  DhopInternal(Stencil, Umu, in, out, dag);
 }

 template <class Impl>
@@ -280,7 +278,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;

-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
+  DhopInternal(StencilEven, UmuOdd, in, out, dag);
 }

 template <class Impl>
@@ -292,7 +290,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;

-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
+  DhopInternal(StencilOdd, UmuEven, in, out, dag);
 }

 template <class Impl>
@@ -323,18 +321,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &


 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
 					       DoubledGaugeField &U,
 					       const FermionField &in,
 					       FermionField &out, int dag) 
 {
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+    DhopInternalOverlappedComms(st,U,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,in,out,dag);
+    DhopInternalSerialComms(st,U,in,out,dag);
 }
 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
 							      DoubledGaugeField &U,
 							      const FermionField &in,
 							      FermionField &out, int dag) 
@@ -356,7 +354,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }

  st.CommunicateComplete(requests);
@@ -367,12 +365,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }
 }

 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
 							  DoubledGaugeField &U,
 							  const FermionField &in,
 							  FermionField &out, int dag) 
@@ -385,7 +383,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }
 };

@@ -237,7 +237,32 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
  //

-  this->DW(psi,D,DaggerNo); 
+  this->DW(psi,D,DaggerNo);
+
+  // DW - DW+iqslash
+  //  (g5 Dw)^dag = g5 Dw
+  //  (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
+  if ( qmu.size() ) {
+
+    std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
+    assert(qmu.size()==Nd);
+
+    FermionField qslash_psi(psi.Grid());
+
+    Gamma::Algebra Gmu [] = {
+			     Gamma::Algebra::GammaX,
+			     Gamma::Algebra::GammaY,
+			     Gamma::Algebra::GammaZ,
+			     Gamma::Algebra::GammaT
+    };
+    qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
+    for(int mu=1;mu<Nd;mu++){
+      qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
+    }
+    ComplexD ci(0.0,1.0);
+    qslash_psi = ci*qslash_psi ; // i qslash
+    D = D + qslash_psi;
+  }

  int nblock=(Ls-1)/2;
  for(int b=0;b<nblock;b++){
@@ -255,15 +280,55 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  }
 	
  {
+    // The 'conventional' Cayley overlap operator is
+    //
+    // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
+    //
+    //
+    // With massless limit 1/2(1+g5 sgnHw)
+    //
+    // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
+    //
+    // However, the conventional normalisation has both a leading order factor of 2 in Zq
+    // at tree level AND a mass dependent (1-m) that are convenient to absorb.
+    //
+    // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
+    //
+    // num = -i sin kmu gmu
+    //
+    // denom ( sqrt(sk^2 + (2shk^2 - 1)^2
+    //    b_k = sk2 - M5;
+    //     
+    //    w_k = sqrt(sk + b_k*b_k);
+    //
+    //    denom= ( w_k + b_k + mass*mass) ;
+    //
+    //    denom= one/denom;
+    //    out = num*denom;
+    //
+    // Chroma, and Grid define partial fraction via 4d operator
+    //
+    //   Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
+    //
+    // Now since:
+    //
+    //      (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
+    //
+    // This corresponds to a modified mass parameter
+    //
+    // It has an annoying 
+    //
+    // 
    double R=(1+this->mass)/(1-this->mass);
-    //R g5 psi[Ls] + p[0] H
+    //R g5 psi[Ls] + p[0] Hw
    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-	
+    
    for(int b=0;b<nblock;b++){
      int s = 2*b+1;
      double pp = p[nblock-1-b];
      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
    }
+   
  }

 }
@@ -411,17 +476,18 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
    {
+      //void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
      int Ls = this->Ls;
      conformable(imported5d.Grid(),this->FermionGrid());
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
@@ -442,7 +508,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

 {
  int Ls = this->Ls;
-
+  qmu.resize(0);
  assert((Ls&0x1)==1); // Odd Ls required
  int nrational=Ls-1;

@@ -460,6 +526,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
  Approx::zolotarev_free(zdata);

 }
+template<class Impl>
+PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+							 GridCartesian         &FiveDimGrid,
+							 GridRedBlackCartesian &FiveDimRedBlackGrid,
+							 GridCartesian         &FourDimGrid,
+							 GridRedBlackCartesian &FourDimRedBlackGrid,
+							 RealD _mass,RealD M5,
+							 std::vector<RealD> &_qmu,
+							 const ImplParams &p)
+  : PartialFractionFermion5D<Impl>(_Umu,
+			     FiveDimGrid,FiveDimRedBlackGrid,
+			     FourDimGrid,FourDimRedBlackGrid,
+			     _mass,M5,p)
+{
+  qmu=_qmu;
+}

 NAMESPACE_END(Grid);

--- a/Show More
+++ b/Show More
				`@@ -0,0 +1,2 @@`

				`mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL`