Adding DWF evec Chirality measurement

Merge branch 'develop' of https://github.com/paboyle/Grid into specflow
Shared Memory test reenabled on every Grid object creation.
2025-06-12 20:27:06 +01:00 · 2025-04-22 22:17:51 +00:00 · 2025-04-18 19:55:36 +00:00 · 2025-04-07 11:51:40 -04:00 · 2025-04-07 11:50:59 -04:00 · 2025-04-04 18:40:20 -04:00
107 changed files with 6799 additions and 3063 deletions
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@ -191,7 +191,7 @@ public:
      
    Lattice<sobj> pgbuf(&pencil_g);
    autoView(pgbuf_v , pgbuf, CpuWrite);
-    std::cout << "CPU view" << std::endl;
+    //std::cout << "CPU view" << std::endl;
    
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
@ -215,7 +215,7 @@ public:
    else if ( sign == forward ) div = 1.0;
    else assert(0);
      
-    std::cout << "Making FFTW plan" << std::endl;
+    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
    FFTW_plan p;
    {
      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +229,7 @@ public:
    }
      
    // Barrel shift and collect global pencil
-    std::cout << "Making pencil" << std::endl;
+    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
    Coordinate lcoor(Nd), gcoor(Nd);
    result = source;
    int pc = processor_coor[dim];
@ -251,7 +251,7 @@ public:
      }
    }
      
-    std::cout << "Looping orthog" << std::endl;
+    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
    // Loop over orthog coords
    int NN=pencil_g.lSites();
    GridStopWatch timer;
@ -274,7 +274,7 @@ public:
    usec += timer.useconds();
    flops+= flops_call*NN;
      
-    std::cout << "Writing back results " << std::endl;
+    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
    // writing out result
    {
      autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +291,7 @@ public:
    }
    result = result*div;
      
-    std::cout << "Destroying plan " << std::endl;
+    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
    // destroying plan
    FFTW<scalar>::fftw_destroy_plan(p);
 #endif
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@ -277,6 +277,38 @@ public:
    assert(0);
  }
 };
+template<class Matrix,class Field>
+class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> { // LinearOperatorBase adaptor applying _Mat plus a real multiple of the identity
+  Matrix &_Mat; // wrapped matrix, held by reference (not owned by this object)
+  RealD shift;  // real shift; enters as shift*in in OpDiag, Op and AdjOp
+public:
+  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) { // out = Mdiag(in) + shift*in
+    _Mat.Mdiag(in,out);
+    out = out + shift*in;
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) { // pure forward to _Mat.Mdir; no shift term is added here
+    _Mat.Mdir(in,out,dir,disp);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ // pure forward to _Mat.MdirAll; no shift term is added here
+    _Mat.MdirAll(in,out);
+  };
+  void Op     (const Field &in, Field &out){ // out = M(in) + shift*in
+    _Mat.M(in,out);
+    out = out + shift * in;
+  }
+  void AdjOp     (const Field &in, Field &out){ // out = Mdag(in) + shift*in (same real shift as Op)
+    _Mat.Mdag(in,out);
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ // Hermitian interface is not provided by this wrapper
+    assert(0); // always fails when asserts are enabled
+  }
+  void HermOp(const Field &in, Field &out){ // Hermitian interface is not provided by this wrapper
+    assert(0); // always fails when asserts are enabled
+  }
+};

 //////////////////////////////////////////////////////////
 // Even Odd Schur decomp operators; there are several
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #ifdef GRID_ONE_MKL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
  typedef int32_t gridblasHandle_t;
@ -89,9 +89,9 @@ public:
      gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::gpu_selector selector;
-      cl::sycl::device selectedDevice { selector };
-      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
+      sycl::gpu_selector selector;
+      sycl::device selectedDevice { selector };
+      sycl::property_list q_prop{sycl::property::queue::in_order()};
      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
      gridblasInit=1;
@ -208,8 +208,8 @@ public:
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);

-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);

    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@ -367,28 +367,70 @@ public:
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
        });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else // beta == 0: overwrite C without reading it, matching every sibling branch
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
@ -414,8 +453,8 @@ public:
    RealD t2=usecond();
    int32_t batchCount = Amk.size();

-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);

    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@ -514,28 +553,70 @@ public:
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
@ -661,29 +742,41 @@ public:
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;	  
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-	  } );
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  });
      } else { 
 	assert(0);
      }
@ -809,28 +902,40 @@ public:
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
      } else { 
 	assert(0);
--- a/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
+++ b/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
@ -144,11 +144,11 @@ public:
      acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
    }    
    RealD t4 = usecond();
-    std::cout << "MulMatrix alloc    took "<< t1-t0<<" us"<<std::endl;
-    std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
-    std::cout << "MulMatrix blas     took "<< t3-t2<<" us"<<std::endl;
-    std::cout << "MulMatrix copy     took "<< t4-t3<<" us"<<std::endl;
-    std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance << "MulMatrix alloc    took "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix blas     took "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix copy     took "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
  }
  
  void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
@ -242,16 +242,16 @@ public:
    RealD flops = 8.0*M*N*K;
    flops = flops/(t4-t3)/1.e3;
    bytes = bytes/(t4-t3)/1.e3;
-    std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
-    std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp    t2 "<< t2-t1<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
-    std::cout << "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp   t6 "<< t6-t5<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp    t2 "<< t2-t1<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp   t6 "<< t6-t5<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
 #else
    int nrhs;
    GridBase *grid;
@ -358,17 +358,17 @@ public:
    flops = flops/(t4-t3)/1.e3;
    bytes = bytes/(t4-t3)/1.e3;
    xybytes = 4*xybytes/(t2-t1)/1.e3;
-    std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
-    std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp    t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
-    std::cout << "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix cp     t5 "<< t5-t4<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix lsum   t6l "<< t6l-t5<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix gsum   t6 "<< t6-t6l<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp    t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp     t5 "<< t5-t4<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix lsum   t6l "<< t6l-t5<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum   t6 "<< t6-t6l<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
 #endif
  }
 };
--- a/Grid/algorithms/iterative/AdefMrhs.h
+++ b/Grid/algorithms/iterative/AdefMrhs.h
@ -63,7 +63,12 @@ class TwoLevelCGmrhs
  GridStopWatch SmoothTimer;
  GridStopWatch InsertTimer;

-  
+  /*
+    Field rrr;
+  Field sss;
+  Field qqq;
+  Field zzz;
+  */  
  // more most opertor functions
  TwoLevelCGmrhs(RealD tol,
 		 Integer maxit,
@ -74,6 +79,12 @@ class TwoLevelCGmrhs
    MaxIterations(maxit),
    _FineLinop(FineLinop),
    _Smoother(Smoother)
+    /*
+    rrr(fine),
+    sss(fine),
+    qqq(fine),
+    zzz(fine)
+*/
  {
    grid       = fine;
  };
@ -81,8 +92,8 @@ class TwoLevelCGmrhs
  // Vector case
  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
  {
-    SolveSingleSystem(src,x);
-    //    SolvePrecBlockCG(src,x);
+    //    SolveSingleSystem(src,x);
+    SolvePrecBlockCG(src,x);
  }

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -657,6 +668,8 @@ public:
    CoarseField PleftProjMrhs(this->coarsegridmrhs);
    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);

+    //    this->rrr=in[0];
+
 #undef SMOOTHER_BLOCK_SOLVE
 #if SMOOTHER_BLOCK_SOLVE
    this->SmoothTimer.Start();
@ -669,6 +682,7 @@ public:
      this->SmoothTimer.Stop();
    }
 #endif
+    //    this->sss=Min[0];
    
    for(int rhs=0;rhs<nrhs;rhs++) {
      
@ -705,9 +719,11 @@ public:
    this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]  
    this->PromoteTimer.Stop();
    this->FineTimer.Start();
+    //    this->qqq=tmp[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
      axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
    }
+    //    this->zzz=out[0];
    this->FineTimer.Stop();
  }
 };
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
-      
+      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

      if(norm < OuterLoopNormMult * stop){
 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 	break;
      }
-      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -245,9 +245,10 @@ until convergence
 	_HermOp(src_n,tmp);
 	//	std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
 	//	std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
-	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+//	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+	RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
 	RealD vden = norm2(src_n);
-	RealD na = vnum/vden;
+	RealD na = std::sqrt(vnum/vden);
 	if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
@ -255,6 +256,7 @@ until convergence
 	src_n = tmp;
      }
    }
+    std::cout << GridLogIRL << " Final evalMaxApprox  " << evalMaxApprox << std::endl;
 	
    std::vector<RealD> lme(Nm);  
    std::vector<RealD> lme2(Nm);
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@ -74,7 +74,7 @@ public:

  void operator() (const Field &src, Field &psi){

-    psi=Zero();
+    //    psi=Zero();
    RealD cp, ssq,rsq;
    ssq=norm2(src);
    rsq=Tolerance*Tolerance*ssq;
--- a/Grid/algorithms/multigrid/Aggregates.h
+++ b/Grid/algorithms/multigrid/Aggregates.h
@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once

+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+
 NAMESPACE_BEGIN(Grid);

 inline RealD AggregatePowerLaw(RealD x)
@ -95,7 +97,7 @@ public:

    RealD scale;

-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
+    ConjugateGradient<FineField> CG(1.0e-3,400,false);
    FineField noise(FineGrid);
    FineField Mn(FineGrid);

@ -108,7 +110,7 @@ public:
      
      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;

-      for(int i=0;i<1;i++){
+      for(int i=0;i<4;i++){

 	CG(hermop,noise,subspace[b]);

@ -124,6 +126,53 @@ public:
    }
  }

+  virtual void CreateSubspaceGCR(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis) // fills subspace[0..nn-1] with GCR-filtered, normalised noise vectors
+  {
+    RealD scale;
+
+    TrivialPrecon<FineField> simple_fine;
+    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12); // NOTE(review): presumably tolerance 1e-3 and 30 max iterations; confirm argument order against the GCR constructor
+    FineField noise(FineGrid);
+    FineField src(FineGrid);
+    FineField guess(FineGrid);
+    FineField Mn(FineGrid);
+
+    for(int b=0;b<nn;b++){ // one basis vector per pass
+      
+      subspace[b] = Zero();
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); // 1/sqrt(norm2): rescale so norm2(noise) becomes 1
+      noise=noise*scale;
+      
+      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
+
+      for(int i=0;i<2;i++){ // two solve-and-renormalise passes; each pass feeds its normalised solution back in as the next source
+	// GCR call signature: void operator() (const Field &src, Field &psi)
+#if 1
+	std::cout << GridLogMessage << " inverting on noise "<<std::endl;
+	src = noise;
+	guess=Zero(); // start every solve from an explicit zero guess
+	GCR(src,guess);
+	subspace[b] = guess;
+#else
+	std::cout << GridLogMessage << " inverting on zero "<<std::endl;
+	src=Zero();
+	guess = noise;
+	GCR(src,guess);
+	subspace[b] = guess;
+#endif
+	noise = subspace[b];
+	scale = std::pow(norm2(noise),-0.5); // renormalise the solution before reuse
+	noise=noise*scale;
+
+      }
+
+      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
+      subspace[b]   = noise; // store the normalised vector, not the raw GCR output
+
+    }
+  }
+
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
@ -160,14 +209,21 @@ public:

    int b =0;
    {
+      ComplexD ip;
      // Filter
      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+      hermop.Op(Mn,tmp);
+      ip= innerProduct(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
+      hermop.AdjOp(Mn,tmp); 
+      ip = innerProduct(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
      b++;
    }

@ -213,8 +269,18 @@ public:
 	  Mn=*Tnp;
 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
 	  subspace[b] = Mn;
-	  hermop.Op(Mn,tmp); 
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+
+	  ComplexD ip;
+
+	  hermop.Op(Mn,tmp);
+	  ip= innerProduct(Mn,tmp); 
+	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
+	  hermop.AdjOp(Mn,tmp); 
+	  ip = innerProduct(Mn,tmp); 
+	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+	  
 	  b++;
 	}

@ -228,6 +294,70 @@ public:
    }
    assert(b==nn);
  }
+
+
+  virtual void CreateSubspacePolyCheby(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop, // fills subspace[0..nn-1]: one Chebyshev-filtered noise vector, then repeated Chebyshev steps with orthogonalisation
+				       int nn,          // number of subspace vectors to generate
+				       double hi,       // upper bound passed to every Chebyshev
+				       double lo1,      // lower bound for the initial filter
+				       int orderfilter, // polynomial order of the initial filter
+				       double lo2,      // lower bound for the subsequent steps
+				       int orderstep)   // polynomial order of each subsequent step
+  {
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
+    // Initial matrix element
+    hermop.Op(noise,Mn);
+    std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {
+      // Filter
+      std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderfilter<<std::endl; // report the order actually used by the filter below
+      Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
+    }
+
+    // Generate a full sequence of Chebyshevs
+    for(int n=1;n<nn;n++){
+      std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
+      Chebyshev<FineField> Cheb(lo2,hi,orderstep);
+      Cheb(hermop,subspace[n-1],Mn); // apply the step polynomial to the previous basis vector
+
+      for(int m=0;m<n;m++){ // subtract the components along all earlier basis vectors
+	ComplexD c = innerProduct(subspace[m],Mn);
+	Mn = Mn - c*subspace[m];
+      }
+      
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5);
+      Mn=Mn*scale;
+      
+      subspace[n]=Mn;
+      
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
+
+    }
+  }
+
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@ -441,8 +441,20 @@ public:
    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
  }
 #else
+  //////////////////////////////////////////////////////////////////////
+  // Galerkin projection of matrix: the same subspace is used on both sides
+  //////////////////////////////////////////////////////////////////////
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    CoarsenOperator(linop,Subspace,Subspace); // delegate to the (U,V) overload with U == V; NOTE(review): that overload block-orthogonalises V then U, so the aliased subspace is processed twice
+  }
+  //////////////////////////////////////////////////////////////////////
+  // Petrov - Galerkin projection of matrix
+  //////////////////////////////////////////////////////////////////////
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & U,
+		       Aggregation<Fobj,CComplex,nbasis> & V)
  {
    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();
@ -458,11 +470,9 @@ public:
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid()); 
-    blockOrthogonalise(InnerProd,Subspace.subspace);
+    blockOrthogonalise(InnerProd,V.subspace);
+    blockOrthogonalise(InnerProd,U.subspace);

-    //    for(int s=0;s<Subspace.subspace.size();s++){
-      //      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
-    //    }
    const int npoint = geom.npoint;
      
    Coordinate clatt = CoarseGrid()->GlobalDimensions();
@ -542,7 +552,7 @@ public:
      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 	tphaseBZ-=usecond();
-	phaV = phaF[p]*Subspace.subspace[i];
+	phaV = phaF[p]*V.subspace[i];
 	tphaseBZ+=usecond();

 	/////////////////////////////////////////////////////////////////////
@ -555,7 +565,7 @@ public:
 	//	std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;

 	tproj-=usecond();
-	blockProject(coarseInner,MphaV,Subspace.subspace);
+	blockProject(coarseInner,MphaV,U.subspace);
 	coarseInner = conjugate(pha[p]) * coarseInner;

 	ComputeProj[p] = coarseInner;
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@ -69,7 +69,7 @@ public:
  }

  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { assert(0);};
+  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
@ -175,10 +175,11 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
 template<class T> using hostVector          = std::vector<T,alignedAllocator<T> >;           // Needs autoview
-template<class T> using Vector              = std::vector<T,uvmAllocator<T> >;               // 
+template<class T> using Vector              = std::vector<T,uvmAllocator<T> >;               // Really want to deprecate
 template<class T> using uvmVector           = std::vector<T,uvmAllocator<T> >;               // auto migrating page
 template<class T> using deviceVector        = std::vector<T,devAllocator<T> >;               // device vector

+/*
 template<class T> class vecView
 {
 protected:
@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
 #define autoVecView(v_v,v,mode)					\
  auto v_v = VectorView(v,mode);				\
  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
+*/

 NAMESPACE_END(Grid);

--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -1,16 +1,15 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM

-#warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);

 #define MAXLINE 512
 static char print_buffer [ MAXLINE ];

-#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
+#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug  << print_buffer << std::endl;
 //#define dprintf(...) 
-
+//#define mprintf(...) 

 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
    AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  //  uint64_t CpuPtr = AccCache.CpuPtr;
  DeviceEvictions++;
@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice   Clone size %ld AccPtr %lx <- CpuPtr %lx",
+	  (uint64_t)AccCache.bytes,
+	  (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    CpuViewClose((uint64_t)Ptr);
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
+  if(bytes>=DeviceMaxBytes) {
+    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
+  }
  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error

  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
+    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
  } else if(AccCache.state==CpuDirty ){
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
  } else {
    assert(0);
  }
@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  // If view is opened on device must remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU \n");
+    dprintf("AccCache entry removed from LRU ");
    LRUremove(AccCache);
  }

@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
  AccCache.accLock--;
  // Move to LRU queue if not locked and close on device
  if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
    LRUinsert(AccCache);
  } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
  }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
--- a/Grid/communicator/Communicator_base.cc
+++ b/Grid/communicator/Communicator_base.cc
@ -57,18 +57,29 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////

+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(ComplexF &c)
+{
+  GlobalSumP2P(c);
+}
+void CartesianCommunicator::GlobalSum(ComplexD &c)
+{
+  GlobalSumP2P(c);
+}
+#else
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
  GlobalSumVector((float *)&c,2);
 }
-void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
-{
-  GlobalSumVector((float *)c,2*N);
-}
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
  GlobalSumVector((double *)&c,2);
 }
+#endif
+void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
+{
+  GlobalSumVector((float *)c,2*N);
+}
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  GlobalSumVector((double *)c,2*N);
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>

+#define NVLINK_GET
+
 NAMESPACE_BEGIN(Grid);

 extern bool Stencil_force_mpi ;
@ -127,7 +129,36 @@ public:
  void GlobalSumVector(ComplexD *c,int N);
  void GlobalXOR(uint32_t &);
  void GlobalXOR(uint64_t &);
-  
+
+  template<class obj> void GlobalSumP2P(obj &o) // Sum o over the processor grid with point-to-point messages, one dimension at a time; the result is written back to o
+  {
+    std::vector<obj> column; // column[p] is the receive slot for shift p within the current dimension
+    obj accum = o; // running sum, starts as this rank's own contribution
+    int source,dest;
+    for(int d=0;d<_ndimension;d++){
+      column.resize(_processors[d]);
+      column[0] = accum; // slot 0 holds this rank's partial sum and is the send buffer for every shift
+      std::vector<MpiCommsRequest_t> list;
+      for(int p=1;p<_processors[d];p++){
+	ShiftedRanks(d,p,source,dest);
+	SendToRecvFromBegin(list,
+			    &column[0],
+			    dest,
+			    &column[p],
+			    source,
+			    sizeof(obj),d*100+p); // byte count, then d*100+p (presumably a per-message tag/direction id; confirm against SendToRecvFromBegin)
+
+      }
+      if (!list.empty()) // avoid triggering assert in comms == none
+	CommsComplete(list);
+      for(int p=1;p<_processors[d];p++){
+	accum = accum + column[p]; // fold in everything received along dimension d
+      }
+    }
+    Broadcast(0,accum); // NOTE(review): presumably forces a bit-identical result on every rank by taking root 0's copy; confirm
+    o=accum;
+  }
+
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
@ -138,8 +169,8 @@ public:
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  void CommsComplete(std::vector<MpiCommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 			   void *xmit,
 			   int dest,
 			   void *recv,
@ -158,6 +189,17 @@ public:
 			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);

+  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+				      void *xmit,
+				      int xmit_to_rank,int do_xmit,
+				      void *recv,
+				      int recv_from_rank,int do_recv,
+				      int xbytes,int rbytes,int dir);
+
+  // Could do a PollHtoD and have a CommsMerge dependence
+  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
+  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
+
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;

 ////////////////////////////////////////////
@ -257,6 +258,25 @@ CartesianCommunicator::~CartesianCommunicator()
    }
  }
 }
+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(float &f){
+  CartesianCommunicator::GlobalSumP2P(f);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  CartesianCommunicator::GlobalSumP2P(d);
+}
+#else
+void CartesianCommunicator::GlobalSum(float &f){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+#endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
@ -287,27 +307,18 @@ void CartesianCommunicator::GlobalMax(double &d)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }

-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
@ -332,7 +343,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  assert(ierr==0);
  list.push_back(xrq);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
 {
  int nreq=list.size();

@ -351,9 +362,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int from,
 					   int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
-  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+  std::vector<MpiCommsRequest_t> reqs(0);

  int myrank = _processor;
  int ierr;
@ -369,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 		    communicator,MPI_STATUS_IGNORE);
  assert(ierr==0);

-  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
-  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@ -381,12 +387,293 @ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int bytes,int dir)
 {
   std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  // Blocking exchange: drive every stage of the packet pipeline in order.
+  // The two Poll stages are empty when ACCELERATOR_AWARE_MPI is defined; in the
+  // host-staged build PollDtoH is the only place the MPI_Isend is posted and
+  // PollIRecv is the only place received data is copied back to the device.
+  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  StencilSendToRecvFromPollDtoH(list);
+  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  StencilSendToRecvFromPollIRecv(list);
   StencilSendToRecvFromComplete(list,dir);
   return offbytes;
 }

-#undef NVLINK_GET // Define to use get instead of put DMA
+
+#ifdef ACCELERATOR_AWARE_MPI
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {}; // no-op when ACCELERATOR_AWARE_MPI is defined
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {}; // no-op when ACCELERATOR_AWARE_MPI is defined
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   int xbytes,int rbytes,int dir)
+{
+  return 0.0; // Do nothing -- no preparation required; reports zero off-node bytes
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, // ACCELERATOR_AWARE_MPI build: xmit/recv are handed straight to MPI or to a device-to-device copy
+							 void *xmit,
+							 int dest,int dox,
+							 void *recv,
+							 int from,int dor,
+							 int xbytes,int rbytes,int dir)
+{
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm; // choose one of the communicator_halo communicators from the direction
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest]; // MPI_UNDEFINED here selects the MPI path for that peer in the tests below
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme  == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+  
+  if ( dor ) {
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32; // built from the sender's rank, mirroring tag= dir+_processor*32 on the send side
+      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      list.push_back(rrq);
+      off_node_bytes+=rbytes;
+    }
+#ifdef NVLINK_GET
+    else { 
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit); // GET: translate xmit for rank `from` and copy from that address into recv
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+    }
+#endif
+  }
+  // Send side: MPI_Isend, or the NVLINK PUT copy when NVLINK_GET is not defined
+  if (dox) {
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+_processor*32;
+      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      list.push_back(xrq);
+      off_node_bytes+=xbytes;
+    } else {
+#ifndef NVLINK_GET
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv); // PUT: translate recv for rank `dest` and copy xmit into that address
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
+    }
+  }
+  return off_node_bytes; // bytes handed to MPI_Irecv / MPI_Isend by this call
+}
+
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) // ACCELERATOR_AWARE_MPI build: finish the copies, then wait on every MPI request in list
+{
+  int nreq=list.size();
+  /*finishes Get/Put*/
+  acceleratorCopySynchronise();
+
+  if (nreq==0) return; // NOTE(review): this early return also skips StencilBarrier() below, an MPI_Barrier on ShmComm; confirm all ranks sharing ShmComm always agree on nreq==0
+  std::vector<MPI_Status> status(nreq);
+  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+  assert(ierr==0);
+  list.resize(0);
+  this->StencilBarrier(); 
+}
+
+#else /* NOT     ... ACCELERATOR_AWARE_MPI */
+///////////////////////////////////////////
+// Pipeline mode through host memory
+///////////////////////////////////////////
+  /*
+   * Packet flow as implemented below:
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   * PHASE 2: (PollDtoH, then Begin)
+   * - as each device - host copy completes, post its MPI send asynch
+   * - post device - device transfers
+   * PHASE 3: (PollIRecv, then Complete)
+   * - as each MPI receive completes, post its host - device transfer asynch
+   * - complete all copies, then MPI_Waitall on the sends
+   *
+   *
+   *********************************
+   * NB could split this further:
+   *--------------------------------
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   * PHASE 2: (BeginInterNode)
+   * - complete all copies 
+   * - post MPI send asynch
+   * PHASE 3: (BeginIntraNode)
+   * - post device - device transfers
+   * PHASE 4: (Complete)
+   * - MPI_waitall
+   * - host-device transfers asynch
+   * - (complete all copies) 
+   */
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, // Host-staged build: post the off-node receive and start staging the off-node send; appends one entry to list for each
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   int xbytes,int rbytes,int dir)
+{
+/*
+ * Bring sequence from Stencil.h down to lower level.
+ * Assume using XeLink is ok
+ */  
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm; // choose one of the communicator_halo communicators from the direction
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest]; // MPI_UNDEFINED here selects the MPI path for that peer in the tests below
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme  == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+
+  void * host_recv = NULL;
+  void * host_xmit = NULL;
+
+  /*
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   */
+  
+  if ( dor ) {
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32; // built from the sender's rank, mirroring tag= dir+_processor*32 on the send side
+      host_recv = this->HostBufferMalloc(rbytes); // MPI receives into a host staging buffer, not into recv
+      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeRecv;
+      srq.bytes      = rbytes;
+      srq.req        = rrq;
+      srq.host_buf   = host_recv;
+      srq.device_buf = recv; // destination of the later host-to-device copy in StencilSendToRecvFromPollIRecv
+      list.push_back(srq);
+      off_node_bytes+=rbytes;
+    }
+  }
+  
+  if (dox) {
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+
+      tag= dir+_processor*32;
+
+      host_xmit = this->HostBufferMalloc(xbytes);
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Start the device-to-host copy; srq.ev is polled in StencilSendToRecvFromPollDtoH
+      
+      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      //      assert(ierr==0);
+      //      off_node_bytes+=xbytes;
+
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      srq.tag        = tag; // tag, dest and commdir are stored so StencilSendToRecvFromPollDtoH can post the MPI_Isend later
+      srq.dest       = dest;
+      srq.commdir    = commdir;
+      list.push_back(srq);
+    }
+  }
+
+  return off_node_bytes; // counts receive bytes only: the xbytes increment above is commented out
+}
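+/*
+ * In the interest of better pipelining, the two routines below poll per packet:
+ * PollIRecv starts the HtoD copy of each receive that MPI_Test reports complete; PollDtoH posts the MPI_Isend of each send whose DtoH copy has completed
+ */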
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) // Spin until every InterNodeRecv entry in list has completed and had its host-to-device copy started
+{
+  int pending = 0;
+  do {
+
+    pending = 0; // recounted on every sweep over the list
+
+    for(int idx = 0; idx<list.size();idx++){
+
+      if ( list[idx].PacketType==InterNodeRecv ) {
+
+	int flag = 0;
+	MPI_Status status;
+	int ierr = MPI_Test(&list[idx].req,&flag,&status);
+	assert(ierr==0);
+
+	if ( flag ) {
+	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
+	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
+	  list[idx].PacketType=InterNodeReceiveHtoD; // state change keeps this entry from being tested again
+	} else {
+	  pending ++;
+	}
+      }
+    }
+    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
+  } while ( pending );
+  
+}
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) // Spin until every InterNodeXmit entry in list has finished its device-to-host copy and had its MPI_Isend posted
+{
+  int pending = 0;
+  do {
+
+    pending = 0; // recounted on every sweep over the list
+
+    for(int idx = 0; idx<list.size();idx++){
+
+      if ( list[idx].PacketType==InterNodeXmit ) {
+
+	if ( acceleratorEventIsComplete(list[idx].ev) ) {
+
+	  void *host_xmit = list[idx].host_buf;
+	  uint32_t xbytes = list[idx].bytes;
+	  int dest        = list[idx].dest;
+	  int tag         = list[idx].tag;
+	  int commdir     = list[idx].commdir;
+	  ///////////////////
+	  // Send packet
+	  ///////////////////
+
+	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
+	  
+	  MPI_Request xrq;
+	  int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+	  assert(ierr==0);
+
+	  list[idx].req        = xrq; // Update the MPI request in the list
+
+	  list[idx].PacketType=InterNodeXmitISend; // state change keeps this entry from being sent twice
+
+	} else {
+	  // not done, so return to polling loop
+	  pending++;
+	}
+      }
+    }
+  } while (pending);
+}  
+
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@ -411,54 +692,106 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;

-  if ( dor ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
-      list.push_back(rrq);
-      off_node_bytes+=rbytes;
-    }
+  void * host_xmit = NULL;
+
+  ////////////////////////////////
+  // Receives already posted
+  // Copies already started
+  ////////////////////////////////
+  /*  
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   */
 #ifdef NVLINK_GET
+  if ( dor ) {
+
+    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
-#endif
-  }
-  
+
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+
+      srq.PacketType = IntraNodeRecv;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
+    }
+  }  
+#else
  if (dox) {
-    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      assert(ierr==0);
-      list.push_back(xrq);
-      off_node_bytes+=xbytes;
-    } else {
-#ifndef NVLINK_GET
+
+    if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-#endif
+
+      CommsRequest_t srq;
+      
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+
+      srq.PacketType = IntraNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
      
    }
  }
-
+#endif
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  int nreq=list.size();
+  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
 
-  acceleratorCopySynchronise();
+  std::vector<MPI_Status> status;
+  std::vector<MPI_Request> MpiRequests;
+    
+  for(int r=0;r<list.size();r++){ // collect only entries that reached InterNodeXmitISend; InterNodeRecv entries are expected to have been consumed by StencilSendToRecvFromPollIRecv beforehand
+    // Must check each Send buf is clear to reuse
+    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
+    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
+  }
 
-  if (nreq==0) return;
+  int nreq=MpiRequests.size();
 
-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-  list.resize(0);
+  if (nreq>0) {
+    status.resize(MpiRequests.size());
+    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+    assert(ierr==0);
+  }
+  
+  //  for(int r=0;r<nreq;r++){
+  //    if ( list[r].PacketType==InterNodeRecv ) {
+  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
+  //    }
+  //  }
+  
+  
+  list.resize(0);               // Delete the list
+  this->HostBufferFreeAll();    // Clean up the buffer allocs
+#ifndef NVLINK_GET
+  this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
+#endif   
 }
+#endif
+////////////////////////////////////////////
+// END PIPELINE MODE / NO CUDA AWARE MPI
+////////////////////////////////////////////
+
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
  return 2.0*bytes;
 }
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int xmit_to_rank,int dox,
+							   void *recv,
+							   int recv_from_rank,int dor,
+							   int xbytes,int rbytes, int dir)
+{
+  return 0.0;
+}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,int dox,
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);

 #if defined (GRID_COMMS_MPI3) 
 typedef MPI_Comm    Grid_MPI_Comm;
+typedef MPI_Request MpiCommsRequest_t;
+#ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
+#else
+/*
+ * Enable state transitions as each packet flows.
+ */
+enum PacketType_t { // stage of a halo packet in the host-staged (non ACCELERATOR_AWARE_MPI) pipeline
+  FaceGather,
+  InterNodeXmit, // off-node send whose device-to-host copy has been started
+  InterNodeRecv, // off-node receive with an MPI_Irecv posted into a host buffer
+  IntraNodeXmit, // same-node send done as a device-to-device copy (PUT)
+  IntraNodeRecv, // same-node receive done as a device-to-device copy (GET)
+  InterNodeXmitISend, // off-node send whose MPI_Isend has been posted
+  InterNodeReceiveHtoD // off-node receive that completed and whose host-to-device copy has been started
+};
+/*
+ * Package of the arguments needed for the various actions along the packet flow
+ */
+typedef struct {
+  PacketType_t PacketType; // current stage of this packet; rewritten in place as the packet advances
+  void *host_buf; // host staging buffer for off-node packets; NULL for intra-node entries
+  void *device_buf;
+  int dest; // destination rank used for the MPI_Isend
+  int tag; // tag used for the MPI_Isend; -1 for intra-node entries
+  int commdir; // for off-node sends, index into communicator_halo
+  unsigned long bytes;
+  acceleratorEvent_t ev; // value returned by the asynchronous accelerator copy, polled for completion
+  MpiCommsRequest_t req; // request from MPI_Irecv / MPI_Isend
+} CommsRequest_t;
+#endif
+
 #else 
+typedef int MpiCommsRequest_t;
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
 #endif
@ -105,7 +137,7 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
-  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
+  //  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  static void SharedMemoryZero(void *dest,size_t bytes);

 };
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
+#else
+#ifdef HAVE_NUMAIF_H
+  #warning " Using NUMAIF "
+#include <numaif.h>
+#endif 
 #endif 
 #include <syscall.h>
 #endif
@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  HostCommBuf= malloc(bytes);
+  // printf("Host buffer allocate for GPU non-aware MPI\n");
+#if 0
+  HostCommBuf= acceleratorAllocHost(bytes);
+#else 
+  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
+#if 0
+  #warning "Moving host buffers to specific NUMA domain"
+  int numa;
+  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
+  if(numa_name) {
+    unsigned long page_size = sysconf(_SC_PAGESIZE);
+    numa = atoi(numa_name);
+    unsigned long page_count = bytes/page_size;
+    std::vector<void *> pages(page_count);
+    std::vector<int>    nodes(page_count,numa);
+    std::vector<int>    status(page_count,-1);
+    for(unsigned long p=0;p<page_count;p++){
+      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
+    }
+    int ret = move_pages(0,
+			 page_count,
+			 &pages[0],
+			 &nodes[0],
+			 &status[0],
+			 MPOL_MF_MOVE);
+    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
+    if (ret) perror(" move_pages failed for reason:");
+  }
+#endif  
+  acceleratorPin(HostCommBuf,bytes);
+#endif  
+
 #endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+    auto zeDevice    = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+    auto zeContext   = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
      
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
@ -880,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
  bzero(dest,bytes);
 #endif
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-  acceleratorCopyToDevice(src,dest,bytes);
-#else   
-  bcopy(src,dest,bytes);
-#endif
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+//  acceleratorCopyToDevice(src,dest,bytes);
+//#else   
+//  bcopy(src,dest,bytes);
+//#endif
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@ -923,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);

    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
  }
  ShmBufferFreeAll();

@ -953,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
 #endif

-  //SharedMemoryTest();
+  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
@ -975,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
       check[0]=GlobalSharedMemory::WorldNode;
       check[1]=r;
       check[2]=magic;
-       GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
+       acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
    }
  }
  ShmBarrier();
  for(uint64_t r=0;r<ShmSize;r++){
-    ShmBarrier();
-    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
-    ShmBarrier();
+    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==magic);
-    ShmBarrier();
  }
+  ShmBarrier();
+  std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
 }

 void *SharedMemory::ShmBuffer(int rank)
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>


 NAMESPACE_BEGIN(Grid); 
-
+const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
  typedef typename vobj::vector_type vector_type;
@ -55,17 +55,17 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  RealD t1,t0;
  t0=usecond();
  if ( !comm_dim ) {
-    std::cout << "CSHIFT: Cshift_local" <<std::endl;
+    //    std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
-    std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
+    //    std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
-    std::cout << "CSHIFT: Cshift_comms" <<std::endl;
+    //    std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  t1=usecond();
-  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
+  if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }

@ -76,12 +76,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
  sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);

-  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+  //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
  if ( sshift[0] == sshift[1] ) {
-    std::cout << "Single pass Cshift_comms" <<std::endl;
+    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x3);
  } else {
-    std::cout << "Two pass Cshift_comms" <<std::endl;
+    //    std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
@ -94,12 +94,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
  sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
  sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);

-  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+  //  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
  if ( sshift[0] == sshift[1] ) {
-    std::cout << "Single pass Cshift_comms" <<std::endl;
+    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
  } else {
-    std::cout << "Two pass Cshift_comms" <<std::endl;
+    //    std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
@ -125,7 +125,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
-    
+#ifndef ACCELERATOR_AWARE_MPI
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+#endif
+  
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  RealD tcopy=0.0;
@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
+
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      
      tcomms-=usecond();
      grid->Barrier();

+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
+#else
+      // bouncy bouncy
+      acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
+      grid->SendToRecvFrom((void *)&hsend_buf[0],
+			   xmit_to_rank,
+			   (void *)&hrecv_buf[0],
+			   recv_from_rank,
+			   bytes);
+      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+
      xbytes+=bytes;
      grid->Barrier();
      tcomms+=usecond();
@ -175,11 +192,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      tscatter+=usecond();
    }
  }
-  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  if (Cshift_verbose){
+    std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  }
 }

 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -197,9 +216,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;

-  std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
-	    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
-	    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
+  //  std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //	    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
+  //	    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;

  assert(comm_dim==1);
  assert(simd_layout==2);
@ -224,12 +243,16 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  static std::vector<deviceVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
- 
+
  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
-
+#ifndef ACCELERATOR_AWARE_MPI
+  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
+  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+#endif
+  
  int bytes = buffer_size*sizeof(scalar_object);

  ExtractPointerArray<scalar_object>  pointers(Nsimd); // 
@ -281,11 +304,22 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo

 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
+#ifdef ACCELERATOR_AWARE_MPI
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
 			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
+#else
+      // bouncy bouncy
+	acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
+	grid->SendToRecvFrom((void *)&hsend_buf[0],
+			     xmit_to_rank,
+			     (void *)&hrecv_buf[0],
+			     recv_from_rank,
+			     bytes);
+	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+#endif

 	xbytes+=bytes;
 	grid->Barrier();
@ -301,12 +335,15 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
  }
-  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  if(Cshift_verbose){
+    std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  }
 }
+
 NAMESPACE_END(Grid); 

 #endif
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
  });
 }

+#define FAST_AXPY_NORM
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpy_norm");
-    return axpy_norm_fast(ret,a,x,y);
+#ifdef FAST_AXPY_NORM
+  return axpy_norm_fast(ret,a,x,y);
+#else
+  ret = a*x+y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpby_norm");
-    return axpby_norm_fast(ret,a,b,x,y);
+#ifdef FAST_AXPY_NORM
+  return axpby_norm_fast(ret,a,b,x,y);
+#else
+  ret = a*x+b*y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }

 /// Trace product
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -290,8 +290,10 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();

+  bool ok;
 #ifdef GRID_SYCL
  uint64_t csum=0;
+  uint64_t csum2=0;
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
  {
    // Hack
@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
    uint64_t *base= (uint64_t *)&l_v[0];
    csum=svm_xor(base,words);
+    ok = FlightRecorder::CsumLog(csum);
+    if ( !ok ) {
+      csum2=svm_xor(base,words);
+      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+    } else {
+      //      csum2=svm_xor(base,words);
+      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+    }
+    assert(ok);
  }
-  FlightRecorder::CsumLog(csum);
 #endif
+  FlightRecorder::StepLog("rank inner product");
  ComplexD nrm = rankInnerProduct(left,right);
+  //  ComplexD nrmck=nrm;
  RealD local = real(nrm);
-  FlightRecorder::NormLog(real(nrm)); 
+  ok = FlightRecorder::NormLog(real(nrm));
+  if ( !ok ) {
+    ComplexD nrm2 = rankInnerProduct(left,right);
+    RealD local2 = real(nrm2);
+    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
+    assert(ok);
+  }
+  FlightRecorder::StepLog("Start global sum");
+  //  grid->GlobalSumP2P(nrm);
  grid->GlobalSum(nrm);
+  FlightRecorder::StepLog("Finished global sum");
+  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
  FlightRecorder::ReductionLog(local,real(nrm)); 
  return nrm;
 }
@ -353,8 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
+  bool ok;
+#ifdef GRID_SYCL
+  uint64_t csum=0;
+  uint64_t csum2=0;
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
+    // z_v
+    {
+      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
+      uint64_t *base= (uint64_t *)&z_v[0];
+      csum=svm_xor(base,words);
+      ok = FlightRecorder::CsumLog(csum);
+      if ( !ok ) {
+	csum2=svm_xor(base,words);
+	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+      }
+      assert(ok);
+    }
+    // inner_v
+    {
+      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
+      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
+      csum=svm_xor(base,words);
+      ok = FlightRecorder::CsumLog(csum);
+      if ( !ok ) {
+	csum2=svm_xor(base,words);
+	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+      }
+      assert(ok);
+    }
+  }
+#endif
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
+  ok = FlightRecorder::NormLog(real(nrm));
+  assert(ok);
+  RealD local = real(nrm);
  grid->GlobalSum(nrm);
+  FlightRecorder::ReductionLog(local,real(nrm));
  return nrm; 
 }
 
@ -498,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
  scalar_type * ptr = (scalar_type *) &result[0];
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
+  //  std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
+  
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object> 
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@ -16,11 +16,11 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
  Integer nsimd= vobj::Nsimd();
  { 
    sycl::buffer<sobj, 1> abuff(&ret, {1});
-    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
-      cgh.parallel_for(cl::sycl::range<1>{osites},
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
+      cgh.parallel_for(sycl::range<1>{osites},
                      Reduction,
-                      [=] (cl::sycl::id<1> item, auto &sum) {
+                      [=] (sycl::id<1> item, auto &sum) {
                        auto osite   = item[0];
                        sum +=Reduce(lat[osite]);
                      });
@ -75,11 +75,11 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
  Word ret = 0;
  { 
    sycl::buffer<Word, 1> abuff(&ret, {1});
-    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
-      cgh.parallel_for(cl::sycl::range<1>{L},
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(sycl::range<1>{L},
                      Reduction,
-                      [=] (cl::sycl::id<1> index, auto &sum) {
+                      [=] (sycl::id<1> index, auto &sum) {
                        sum ^=vec[index];
                      });
    });
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
  
  //copy offsets to device
-  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  
  
  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
    exit(EXIT_FAILURE);
  }
  
-  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  
  //sync after copy
  accelerator_barrier();
@ -141,11 +141,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
  });

  for (int r = 0; r < rd; r++) {
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+      theGridAccelerator->submit([&](sycl::handler &cgh) {
+          auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
+          cgh.parallel_for(sycl::range<1>{subvol_size},
          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
+          [=](sycl::id<1> item, auto &sum) {
              auto s = item[0];
              sum += rb_p[r*subvol_size+s];
          });
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@ -466,9 +466,15 @@ public:
    static deviceVector<vobj> recv_buf;
    send_buf.resize(buffer_size*2*depth);    
    recv_buf.resize(buffer_size*2*depth);
+#ifndef ACCELERATOR_AWARE_MPI
+    static hostVector<vobj> hsend_buf; 
+    static hostVector<vobj> hrecv_buf;
+    hsend_buf.resize(buffer_size*2*depth);    
+    hrecv_buf.resize(buffer_size*2*depth);
+#endif    

-    std::vector<CommsRequest_t> fwd_req;   
-    std::vector<CommsRequest_t> bwd_req;   
+    std::vector<MpiCommsRequest_t> fwd_req;   
+    std::vector<MpiCommsRequest_t> bwd_req;   

    int words = buffer_size;
    int bytes = words * sizeof(vobj);
@ -495,9 +501,17 @@ public:
      t_gather+=usecond()-t;

      t=usecond();
+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(fwd_req,
 				(void *)&send_buf[d*buffer_size], xmit_to_rank,
 				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+#else
+      acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
+      grid->SendToRecvFromBegin(fwd_req,
+				(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
+				(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+      acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
+#endif
      t_comms+=usecond()-t;
     }
    for ( int d=0;d < depth ; d ++ ) {
@ -508,9 +522,17 @@ public:
      t_gather+= usecond() - t;

      t=usecond();
+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(bwd_req,
 				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
 				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+#else
+      acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
+      grid->SendToRecvFromBegin(bwd_req,
+				(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
+				(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+      acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
+#endif      
      t_comms+=usecond()-t;
    }

--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@ -98,7 +98,7 @@ public:
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
-
+ 
  /////////////////////////////////////////////////////////////
  // virtual smeared interface through configuration container
  /////////////////////////////////////////////////////////////
@ -132,6 +132,10 @@ public:
 template <class GaugeField >
 class EmptyAction : public Action <GaugeField>
 {
+  using Action<GaugeField>::refresh;
+  using Action<GaugeField>::Sinitial;
+  using Action<GaugeField>::deriv;
+
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative
--- a/Grid/qcd/action/fermion/AbstractEOFAFermion.h
+++ b/Grid/qcd/action/fermion/AbstractEOFAFermion.h
@ -55,6 +55,11 @@ public:
  RealD alpha; // Mobius scale
  RealD k;     // EOFA normalization constant

+  // Device resident
+  deviceVector<Coeff_t> d_shift_coefficients;
+  deviceVector<Coeff_t> d_MooeeInv_shift_lc;
+  deviceVector<Coeff_t> d_MooeeInv_shift_norm;
+  
  virtual void Instantiatable(void) = 0;

  // EOFA-specific operations
@ -92,6 +97,11 @@ public:
    this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
      ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
      ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
+    
+    d_shift_coefficients.resize(Ls);
+    d_MooeeInv_shift_lc.resize(Ls);
+    d_MooeeInv_shift_norm.resize(Ls);
+
  };
 };

--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -124,6 +124,11 @@ public:
  RealD                _b;
  RealD                _c;

+  // possible boost
+  std::vector<ComplexD> qmu;
+  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
+  void addQmu(const FermionField &in, FermionField &out, int dag);
+  
  // Cayley form Moebius (tanh and zolotarev)
  std::vector<Coeff_t> omega;
  std::vector<Coeff_t> bs;    // S dependent coeffs
@ -143,6 +148,17 @@ public:
  std::vector<Coeff_t> ueem;
  std::vector<Coeff_t> dee;

+  // Device memory
+  deviceVector<Coeff_t> d_diag;
+  deviceVector<Coeff_t> d_upper;
+  deviceVector<Coeff_t> d_lower;
+
+  deviceVector<Coeff_t> d_lee;
+  deviceVector<Coeff_t> d_dee;
+  deviceVector<Coeff_t> d_uee;
+  deviceVector<Coeff_t> d_leem;
+  deviceVector<Coeff_t> d_ueem;
+
  // Matrices of 5d ee inverse params
  //  std::vector<iSinglet<Simd> >  MatpInv;
  //  std::vector<iSinglet<Simd> >  MatmInv;
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@ -60,6 +60,50 @@ public:
  //      virtual void   Instantiatable(void)=0;
  virtual void   Instantiatable(void) =0;

+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  {
+    // Free-field solve: out = e^{+i ph} * FFT^-1[ MomentumSpacePropagatorHw( FFT[ e^{-i ph} * in ] ) ].
+    // boundary and twist are taken by value; the local twist copy is shifted by the boundary phase below.
+    std::cout << "Free Propagator for ContinuedFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    FFT theFFT((GridCartesian *) in.Grid());
+    // Build the boundary-condition phase field ph
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//one twist entry per direction
+    assert(boundary.size() == Nd);//one boundary factor per direction
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// Coordinate along direction nu; shift is zero here, so no index offset is applied.
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu]));
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+
+    // Transform the phase-stripped source in_buf (not the raw in) so it pairs with the phase restored on out.
+    theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
+    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    //restore the boundary-condition phase on the result
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    std::vector<double> twist(Nd,0.0); //default: zero twist in all Nd directions
+    std::vector<Complex> boundary;
+    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: boundary factor 1 in every direction
+    FreePropagator(in,out,mass,boundary,twist);
+  };
+
+  
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@ -42,7 +42,7 @@ public:

     void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
       this->MomentumSpacePropagatorHw(out,in,_m,twist);
-  };
+     };

  // Constructors
  OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@ -41,6 +41,10 @@ public:
 public:

  // Constructors
+  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };

  OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
 				      GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@ -41,6 +41,9 @@ public:
 public:

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
  // Constructors
  OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
 				   GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@ -40,6 +40,9 @@ public:
  INHERIT_IMPL_TYPES(Impl);

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist); // thin forwarder: every argument is passed straight through to MomentumSpacePropagatorHw
+  };
  // Constructors
  OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
 					GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@ -41,6 +41,9 @@ public:
 public:

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist); // thin forwarder: every argument is passed straight through to MomentumSpacePropagatorHw
+  };
  // Constructors
  OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
 					  GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@ -40,6 +40,11 @@ public:
  INHERIT_IMPL_TYPES(Impl);

  virtual void   Instantiatable(void){};
+
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist); // thin forwarder: every argument is passed straight through to MomentumSpacePropagatorHw
+  };
+
  // Constructors
  OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
 					       GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
 public:
  INHERIT_IMPL_TYPES(Impl);

-  const int part_frac_chroma_convention=1;
+  const int part_frac_chroma_convention=0;

  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
@ -83,11 +83,70 @@ public:
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());

+  PartialFractionFermion5D(GaugeField &_Umu,
+			   GridCartesian         &FiveDimGrid,
+			   GridRedBlackCartesian &FiveDimRedBlackGrid,
+			   GridCartesian         &FourDimGrid,
+			   GridRedBlackCartesian &FourDimRedBlackGrid,
+			   RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
+
+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  { // Free propagator via FFT: rotate 'in' by the boundary phase, FFT, apply the momentum-space propagator, inverse FFT, rotate back.
+    std::cout << "Free Propagator for PartialFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    // boundary and twist must each hold Nd entries (asserted below); twist is a by-value copy and is shifted in the loop.
+    FFT theFFT((GridCartesian *) in.Grid());
+
+    //phase for boundary condition: only real(boundary[nu]) is used, through acos (so +1 -> 0 and -1 -> pi)
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// shift is zero here, so direction nu maps directly onto grid dimension nu (no fifth-dimension offset is applied).
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu]));
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary (boundary phase expressed in units of 2*pi)
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+    // Transform the phase-rotated source in_buf (not the raw 'in') so the phase removed here matches the one restored on 'out' below.
+    theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
+    if ( this->qmu.size() ){
+      this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
+    } else {
+      this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    }
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    
+    //phase for boundary condition: undo the rotation that was applied to the source
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    // Convenience overload: periodic boundary conditions and zero twist in every direction.
+    std::vector<Complex> periodic_bc(Nd,Complex(1));
+    std::vector<double> zero_twist(Nd,0.0);
+    FreePropagator(in,out,mass,periodic_bc,zero_twist);
+  };
+
+  void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);}; // stores a copy of _qmu, then asserts it has exactly Nd entries
+  void addQmu(const FermionField &in, FermionField &out, int dag);
+
 protected:

  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);

+  std::vector<RealD> qmu;
+
  // Part frac
  RealD mass;
  RealD dw_diag;
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -414,29 +414,6 @@ public:
    //    surface_list.resize(0);
    this->same_node.resize(npoints);
  };
-
-  /*
-  void BuildSurfaceList(int Ls,int vol4){
-
-    // find same node for SHM
-    // Here we know the distance is 1 for WilsonStencil
-    for(int point=0;point<this->_npoints;point++){
-      this->same_node[point] = this->SameNode(point);
-    }
-    
-    for(int site = 0 ;site< vol4;site++){
-      int local = 1;
-      for(int point=0;point<this->_npoints;point++){
-	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ 
-	  local = 0;
-	}
-      }
-      if(local == 0) { 
-	surface_list.push_back(site);
-      }
-    }
-  }
-  */
  
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
@ -507,6 +484,11 @@ public:
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    accelerator_barrier();
+#ifdef NVLINK_GET
+    this->_grid->StencilBarrier(); // He can now get my local gather, I can get his
+    // Sync shared memory on a single node; could use an asynchronous barrier here and defer check
+    // Or issue barrier AFTER the DMA is running
+#endif    
  }

 };
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@ -109,6 +109,8 @@ public:
  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
+				  std::vector<double> qmu) ;

  // Implement hopping term non-hermitian hopping term; half cb or both
  // Implement s-diagonal DW
@ -117,6 +119,9 @@ public:
  void DhopOE(const FermionField &in, FermionField &out,int dag);
  void DhopEO(const FermionField &in, FermionField &out,int dag);

+  void DhopComms  (const FermionField &in, FermionField &out);
+  void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids);
+  
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@ -57,6 +57,10 @@ public:
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 int interior=1,int exterior=1) ;

+  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 uint64_t *ids);
+  
  static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out,
 			    int interior=1,int exterior=1) ;
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -48,7 +48,8 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
  mass_plus(_mass), mass_minus(_mass)
-{ 
+{
+  // qmu defaults to zero size;
 }

 ///////////////////////////////////////////////////////////////
@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  M5Ddag(psi,psi,Din,lower,diag,upper);
 }

+template<class Impl>
+void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
+{
+  // Adds  sum_mu (i q_mu) gamma_mu psi  onto chi, using the stored qmu vector.
+  // With dag set, each coefficient i*q_mu is complex-conjugated first.
+  // An empty qmu means there is nothing to add and chi is left untouched.
+  if ( qmu.size() == 0 ) return;
+
+  assert(qmu.size()==Nd);
+
+  // Gamma matrix for each of the directions, indexed by mu
+  Gamma::Algebra gamma_dir [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+  ComplexD imag_unit(0,1);
+
+  // One pass: build the coefficient for direction mu and accumulate its term.
+  for(int mu=0;mu<Nd;mu++){
+    ComplexD weight = imag_unit*qmu[mu];
+    if ( dag ) weight = conjugate(weight);
+
+    chi = chi + Gamma(gamma_dir[mu])*psi*weight;
+  }
+}
+
 template<class Impl>
 void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
@ -277,8 +306,12 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  
  // Assemble Din
  Meooe5D(psi,Din);
-  
+
  this->DW(Din,chi,DaggerNo);
+
+  // add i q_mu gamma_mu here
+  addQmu(Din,chi,DaggerNo);
+  
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
  
@ -295,6 +328,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  FermionField Din(psi.Grid());
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
+
+  // add -i conj(q_mu) gamma_mu here ... if qmu is real, gamma_5 hermitian, otherwise not.
+  addQmu(psi,Din,DaggerYes);
  
  MeooeDag5D(Din,chi);
  
@ -488,7 +524,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  leem.resize(Ls);
  uee.resize(Ls);
  ueem.resize(Ls);
-  
+
  for(int i=0;i<Ls;i++){
    
    dee[i] = bee[i];
@ -529,6 +565,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    dee[Ls-1] += delta_d;
  }  

+  //////////////////////////////////////////
+  // Device buffers
+  //////////////////////////////////////////
+  d_diag.resize(Ls);
+  d_upper.resize(Ls);
+  d_lower.resize(Ls);
+
+  d_dee.resize(Ls);
+  d_lee.resize(Ls);
+  d_uee.resize(Ls);
+  d_leem.resize(Ls);
+  d_ueem.resize(Ls);
  //  int inv=1;
  //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@ -57,9 +57,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,

  int Ls =this->Ls;

-  static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
  
  auto pdiag = &d_diag[0];
  auto pupper = &d_upper[0];
@ -99,9 +99,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,

  int Ls=this->Ls;

-  static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
  
  auto pdiag = &d_diag[0];
  auto pupper = &d_upper[0];
@ -134,11 +134,11 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi

  int Ls=this->Ls;

-  static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));

  auto plee  = & d_lee [0];
  auto pdee  = & d_dee [0];
@ -196,11 +196,11 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
  autoView(psi , psi_i,AcceleratorRead);
  autoView(chi , chi_i,AcceleratorWrite);

-  static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));

  auto plee  = & d_lee [0];
  auto pdee  = & d_dee [0];
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@ -42,13 +42,13 @@ template<class Impl>
 void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
 {
  // How to check Ls matches??
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
  int Ls = this->Ls;
+  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
  assert(zdata->db==Ls);// Beta has Ls coeffs

  R=(1+this->mass)/(1-this->mass);
@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@ -51,13 +51,13 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());

-  static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
-  
-  auto pdiag = &d_diag[0];
-  auto pupper = &d_upper[0];
-  auto plower = &d_lower[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  
@ -89,14 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  autoView( phi , phi_i, AcceleratorRead);
  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
  
-  auto pdiag = &d_diag[0];
-  auto pupper = &d_upper[0];
-  auto plower = &d_lower[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol

@ -125,18 +125,18 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  autoView( chi, chi_i, AcceleratorWrite);
  int Ls = this->Ls;

-  static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
-
-  auto plee  = & d_lee [0];
-  auto pdee  = & d_dee [0];
-  auto puee  = & d_uee [0];
-  auto pleem = & d_leem[0];
-  auto pueem = & d_ueem[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
  
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+
  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@ -50,14 +50,14 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField

  assert(phi.Checkerboard() == psi.Checkerboard());

-  static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
-  
-  auto pdiag = &d_diag[0];
-  auto pupper = &d_upper[0];
-  auto plower = &d_lower[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];

+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -93,15 +93,15 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
  
  assert(phi.Checkerboard() == psi.Checkerboard());

-  static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t));
-  
-  auto pdiag = &d_diag[0];
-  auto pupper = &d_upper[0];
-  auto plower = &d_lower[0];
-  auto pshift_coeffs = &d_shift_coeffs[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+  auto pshift_coeffs = &this->d_shift_coefficients[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@ -138,14 +138,14 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
  autoView(chi , chi_i, AcceleratorWrite);

  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
  
-  auto pdiag = &d_diag[0];
-  auto pupper = &d_upper[0];
-  auto plower = &d_lower[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@ -180,16 +180,16 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm

  assert(phi.Checkerboard() == psi.Checkerboard());

-  static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t));
-  
-  auto pdiag = &d_diag[0];
-  auto pupper = &d_upper[0];
-  auto plower = &d_lower[0];
-  auto pshift_coeffs = &d_shift_coeffs[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+  auto pshift_coeffs = &this->d_shift_coefficients[0];

+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
+  
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  auto pm = this->pm;

@ -230,17 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);

-  static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];

-  auto plee  = & d_lee [0];
-  auto pdee  = & d_dee [0];
-  auto puee  = & d_uee [0];
-  auto pleem = & d_leem[0];
-  auto pueem = & d_ueem[0];
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));

  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }

@ -293,23 +293,22 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  autoView(chi , chi_i, AcceleratorWrite);

  // Move into object and constructor
-  static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
-
  auto pm = this->pm;
-  auto plee  = & d_lee [0];
-  auto pdee  = & d_dee [0];
-  auto puee  = & d_uee [0];
-  auto pleem = & d_leem[0];
-  auto pueem = & d_ueem[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
+  auto pMooeeInv_shift_lc   = &this->d_MooeeInv_shift_lc[0];
+  auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];

-  static deviceVector<Coeff_t> d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
-  auto pMooeeInv_shift_lc   = &d_MooeeInv_shift_lc[0];
-  auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0];
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -367,17 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);

-  static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
+  auto plee  = &this->d_lee [0];
+  auto pdee  = &this->d_dee [0];
+  auto puee  = &this->d_uee [0];
+  auto pleem = &this->d_leem[0];
+  auto pueem = &this->d_ueem[0];

-  auto plee  = & d_lee [0];
-  auto pdee  = & d_dee [0];
-  auto puee  = & d_uee [0];
-  auto pleem = & d_leem[0];
-  auto pueem = & d_ueem[0];
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -426,25 +425,23 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
  autoView(chi , chi_i, AcceleratorWrite);
  int Ls = this->Ls;

-  static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
-
  auto pm = this->pm;
-  auto plee  = & d_lee [0];
-  auto pdee  = & d_dee [0];
-  auto puee  = & d_uee [0];
-  auto pleem = & d_leem[0];
-  auto pueem = & d_ueem[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];

-  static deviceVector<Coeff_t> d_MooeeInvDag_shift_lc(Ls);
-  static deviceVector<Coeff_t> d_MooeeInvDag_shift_norm(Ls);
-  acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
-  auto pMooeeInvDag_shift_lc   = &d_MooeeInvDag_shift_lc[0];
-  auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0];
+  auto pMooeeInvDag_shift_lc   = &this->d_MooeeInv_shift_lc[0];
+  auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));

  //  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
  //  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@ -237,7 +237,32 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
  //

-  this->DW(psi,D,DaggerNo); 
+  this->DW(psi,D,DaggerNo);
+
+  // Optional momentum insertion: D -> D + i qslash psi, applied only when qmu is non-empty
+  //  (g5 Dw)^dag = g5 Dw
+  //  (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
+  if ( qmu.size() ) {
+    // Check the length BEFORE qmu[0..3] is indexed below, so a short vector trips the assert instead of being read out of bounds
+    assert(qmu.size()==Nd);
+    std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
+
+    FermionField qslash_psi(psi.Grid());
+
+    Gamma::Algebra Gmu [] = {
+			     Gamma::Algebra::GammaX,
+			     Gamma::Algebra::GammaY,
+			     Gamma::Algebra::GammaZ,
+			     Gamma::Algebra::GammaT
+    };
+    qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
+    for(int mu=1;mu<Nd;mu++){
+      qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
+    }
+    ComplexD ci(0.0,1.0);
+    qslash_psi = ci*qslash_psi ; // i qslash
+    D = D + qslash_psi;
+  }

  int nblock=(Ls-1)/2;
  for(int b=0;b<nblock;b++){
@ -255,15 +280,55 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  }
 	
  {
+    // The 'conventional' Cayley overlap operator is
+    //
+    // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
+    //
+    //
+    // With massless limit 1/2(1+g5 sgnHw)
+    //
+    // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
+    //
+    // However, the conventional normalisation has both a leading order factor of 2 in Zq
+    // at tree level AND a mass dependent (1-m) that are convenient to absorb.
+    //
+    // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
+    //
+    // num = -i sin kmu gmu
+    //
+    // denom ( sqrt(sk^2 + (2shk^2 - 1)^2
+    //    b_k = sk2 - M5;
+    //     
+    //    w_k = sqrt(sk + b_k*b_k);
+    //
+    //    denom= ( w_k + b_k + mass*mass) ;
+    //
+    //    denom= one/denom;
+    //    out = num*denom;
+    //
+    // Chroma, and Grid define partial fraction via 4d operator
+    //
+    //   Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
+    //
+    // Now since:
+    //
+    //      (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
+    //
+    // This corresponds to a modified mass parameter
+    //
+    // It has an annoying 
+    //
+    // 
    double R=(1+this->mass)/(1-this->mass);
-    //R g5 psi[Ls] + p[0] H
+    //R g5 psi[Ls] + p[0] Hw
    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-	
+    
    for(int b=0;b<nblock;b++){
      int s = 2*b+1;
      double pp = p[nblock-1-b];
      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
    }
+   
  }

 }
@ -411,17 +476,18 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
    {
+      //void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
      int Ls = this->Ls;
      conformable(imported5d.Grid(),this->FermionGrid());
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
@ -442,7 +508,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

 {
  int Ls = this->Ls;
-
+  qmu.resize(0);
  assert((Ls&0x1)==1); // Odd Ls required
  int nrational=Ls-1;

@ -460,6 +526,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
  Approx::zolotarev_free(zdata);

 }
+// Constructor overload that additionally takes a momentum vector _qmu.
+// All other setup is delegated to the constructor without _qmu; afterwards _qmu is copied into qmu.
+template<class Impl>
+PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+							 GridCartesian         &FiveDimGrid,
+							 GridRedBlackCartesian &FiveDimRedBlackGrid,
+							 GridCartesian         &FourDimGrid,
+							 GridRedBlackCartesian &FourDimRedBlackGrid,
+							 RealD _mass,RealD M5,
+							 std::vector<RealD> &_qmu,
+							 const ImplParams &p)
+  : PartialFractionFermion5D<Impl>(_Umu,FiveDimGrid,FiveDimRedBlackGrid,FourDimGrid,FourDimRedBlackGrid,_mass,M5,p)
+{
+  // The delegated constructor has already run; store the caller's momentum vector
+  this->qmu = _qmu;
+}

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -325,29 +325,25 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
  // Start comms  // Gather intranode and extra node differentiated??
  /////////////////////////////
  {
-    std::cout << " WilsonFermion5D gather " <<std::endl;
+    //    std::cout << " WilsonFermion5D gather " <<std::endl;
    GRID_TRACE("Gather");
    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
  }
  
-  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
+  //  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
  std::vector<std::vector<CommsRequest_t> > requests;
-  auto id=traceStart("Communicate overlapped");
-  st.CommunicateBegin(requests);

+#if 1
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  {
-  std::cout << " WilsonFermion5D Comms merge " <<std::endl;
-    GRID_TRACE("MergeSHM");
-    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  }
-      
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms 
+#endif
+
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
-  std::cout << " WilsonFermion5D Interior " <<std::endl;
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagInterior");
@ -356,25 +352,35 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-
+  
+  //ifdef GRID_ACCELERATED
+#if 0
+  /////////////////////////////
+  // Overlap with comms -- on GPU the interior kernel call is nonblocking
+  /////////////////////////////
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+#endif
+  
+  
  /////////////////////////////
  // Complete comms
  /////////////////////////////
-  std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
+  //  std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
  st.CommunicateComplete(requests);
-  traceStop(id);
+  //  traceStop(id);

  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
  {
-    std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
+    //    std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
    GRID_TRACE("Merge");
    st.CommsMerge(compressor);
  }
  

-  std::cout << " WilsonFermion5D Exterior " <<std::endl;
+  //  std::cout << " WilsonFermion5D Exterior " <<std::endl;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
@ -382,7 +388,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
-  std::cout << " WilsonFermion5D Done " <<std::endl;
+  //  std::cout << " WilsonFermion5D Done " <<std::endl;
 }


@ -397,13 +403,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,

  int LLs = in.Grid()->_rdimensions[0];

-  std::cout << " WilsonFermion5D Halo exch " <<std::endl;
+  //  std::cout << " WilsonFermion5D Halo exch " <<std::endl;
  {
    GRID_TRACE("HaloExchange");
    st.HaloExchangeOpt(in,compressor);
  }
  
-  std::cout << " WilsonFermion5D Dhop " <<std::endl;
+  //  std::cout << " WilsonFermion5D Dhop " <<std::endl;
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDag");
@ -412,7 +418,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
-  std::cout << " WilsonFermion5D Done " <<std::endl;
+  //  std::cout << " WilsonFermion5D Done " <<std::endl;
 }


@ -438,6 +444,29 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int

  DhopInternal(StencilOdd,UmuEven,in,out,dag);
 }
+// Runs only the halo exchange of `in` through Stencil (no hopping-term kernel is launched here).
+// `out` is checked for grid conformability and has its checkerboard set to match `in`.
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
+{
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+  out.Checkerboard() = in.Checkerboard();
+
+  // Compressor is built with dagger flag 0
+  int no_dagger = 0;
+  Compressor compressor(no_dagger);
+  Stencil.HaloExchangeOpt(in,compressor);
+}
+// Launches the id-recording DhopKernel overload on the full-grid Stencil and Umu.
+// No halo exchange is performed here. `ids` is forwarded untouched to the kernel,
+// which writes one entry per (site, lane) it processes.
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
+{
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+  out.Checkerboard() = in.Checkerboard();
+
+  const int kernelOpt = WilsonKernelsStatic::Opt;
+  const int fifthDim  = in.Grid()->_rdimensions[0];
+  Kernels::DhopKernel(kernelOpt,Stencil,Umu,Stencil.CommBuf(),fifthDim,Umu.oSites(),in,out,ids);
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
@ -740,6 +769,15 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe

 template<class Impl>
 void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
+{
+  // No momentum insertion: forward to the Q variant with qmu = 0.0 in each of the Nd directions
+  MomentumSpacePropagatorHwQ(out,in,mass,twist,std::vector<double>(Nd,0.0));
+}
+template<class Impl>
+void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,
+						       RealD mass,
+						       std::vector<double> twist,
+						       std::vector<double> qmu)
 {
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
@ -755,6 +793,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    typedef typename FermionField::scalar_type ScalComplex;

    typedef Lattice<iSinglet<vector_type> > LatComplex;
+    typedef iSpinMatrix<ScalComplex> SpinMat;


    Coordinate latt_size   = _grid->_fdimensions;
@ -772,8 +811,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    LatComplex kmu(_grid); 
    ScalComplex ci(0.0,1.0);

+    std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
+    
    for(int mu=0;mu<Nd;mu++) {
-
+      
      LatticeCoordinate(kmu,mu);

      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
@ -782,9 +823,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
      kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions

      sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
-      sk  = sk  + sin(kmu)*sin(kmu); 

-      num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);
+      sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]); 
+
+      // Terms for boosted Fermion
+      // 1/2 [ -i gamma.(sin p + q )     ]
+      //     [ --------------------- + 1 ]
+      //     [         wq + b            ]
+      //
+      // wq = sqrt( (sinp+q)^2 + b^2 )
+      //
+      
+      num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in);

    }
    num = num + mass * in ;
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else {							\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  acceleratorSynchronise();						\
+  acceleratorSynchronise();					\
  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
  Recon(result, Uchi);

@ -411,6 +411,46 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }

+// MAKE_ID(A): integer tag built from intel_get_eu_id()<<16, intel_get_slice_id()<<8 and
+// intel_get_subslice_id() when both GRID_SYCL and GRID_SIMT are defined; otherwise it is (0).
+// The argument A is unused. Every expansion is fully parenthesised so it composes safely in expressions.
+#ifdef GRID_SYCL
+extern "C" {
+    ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
+    void  SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
+}
+#ifdef GRID_SIMT
+#define MAKE_ID(A) ((intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id()))
+#else
+#define MAKE_ID(A) (0)
+#endif
+#else
+#define MAKE_ID(A) (0)
+#endif
+// KERNEL_CALL_ID(A): run site kernel WilsonKernels<Impl>::A over NN = Nsite*Ls sites and, for each
+// (site, lane), store MAKE_ID() into ids[sF*Nsimd+lane]; ends with accelerator_barrier().
+// Expects st_v, U_v, buf, in_v, out_v, ids, Nsite and Ls to be in scope at the expansion site.
+// NOTE(review): ids presumably needs room for Nsite*Ls*Nsimd entries -- confirm the lane range with callers.
+#define KERNEL_CALL_ID(A)						\
+  const uint64_t    NN = Nsite*Ls;					\
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+      const int Nsimd = SiteHalfSpinor::Nsimd();			\
+      const int lane=acceleratorSIMTlane(Nsimd);                        \
+      int idx=sF*Nsimd+lane;						\
+      uint64_t id = MAKE_ID();						\
+      ids[idx]=id;							\
+    });									\
+  accelerator_barrier();

 #define KERNEL_CALLNB(A)						\
  const uint64_t    NN = Nsite*Ls;					\
@ -418,7 +458,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
      int sF = ss;							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
-  });
+    });

 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();

@ -451,6 +491,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
    });}

+
+
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
@ -475,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
-     // dependent on result of merge
+     //     // dependent on result of merge
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
@ -485,6 +527,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
+
+// DhopKernel overload that records an id per (site, lane) into `ids` while applying the hopping term.
+// Opt is not consulted: GenericDhopSite is always the site kernel used.
+// NOTE(review): ids must be writable from inside the accelerator_forNB body -- confirm it is device-accessible.
+template <class Impl>
+void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+				     int Ls, int Nsite, const FermionField &in, FermionField &out,
+				     uint64_t *ids)
+{
+    // View names (U_v, in_v, out_v, st_v) are referenced by name inside KERNEL_CALL_ID
+    autoView(U_v  ,  U,AcceleratorRead);
+    autoView(in_v , in,AcceleratorRead);
+    autoView(out_v,out,AcceleratorWrite);
+    autoView(st_v , st,AcceleratorRead);
+    KERNEL_CALL_ID(GenericDhopSite); // the macro itself ends with accelerator_barrier()
+}
  template <class Impl>
  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 					  int Ls, int Nsite, const FermionField &in, FermionField &out,
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@ -40,6 +40,11 @@ public:

  INHERIT_GIMPL_TYPES(Gimpl);

+  using Action<GaugeField>::S;
+  using Action<GaugeField>::Sinitial;
+  using Action<GaugeField>::deriv;
+  using Action<GaugeField>::refresh;
+
 private:
  RealD c_plaq;
  RealD c_rect;
--- a/Grid/qcd/action/gauge/WilsonGaugeAction.h
+++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h
@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
 public:  
  INHERIT_GIMPL_TYPES(Gimpl);

+  using Action<GaugeField>::S;
+  using Action<GaugeField>::Sinitial;
+  using Action<GaugeField>::deriv;
+  using Action<GaugeField>::refresh;
+  
  /////////////////////////// constructors
  explicit WilsonGaugeAction(RealD beta_):beta(beta_){};

--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
--- a/Grid/qcd/utils/SUn.impl.h
+++ b/Grid/qcd/utils/SUn.impl.h
@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
 ////////////////////////////////////////////////////////////////////////
 // Map a su2 subgroup number to the pair of rows that are non zero
 ////////////////////////////////////////////////////////////////////////
-static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
+static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
  assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));

  int spare = su2_index;
--- a/Grid/qcd/utils/Sp2n.impl.h
+++ b/Grid/qcd/utils/Sp2n.impl.h
@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
 // Map a su2 subgroup number to the pair of rows that are non zero
 ////////////////////////////////////////////////////////////////////////
 template <ONLY_IF_Sp>
-static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
+static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
  const int nsp=ncolour/2;
  assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));

--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -121,17 +121,22 @@ class CartesianStencilAccelerator {
  StencilVector same_node;
  Coordinate    _simd_layout;
  Parameters    parameters;
+  ViewMode mode;
  StencilEntry*  _entries_p;
+  StencilEntry*  _entries_host_p;
  cobj* u_recv_buf_p;
  cobj* u_send_buf_p;

  accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }

-  accelerator_inline int GetNodeLocal(int osite,int point) const {
-    return this->_entries_p[point+this->_npoints*osite]._is_local;
+  // Not a device function
+  inline int GetNodeLocal(int osite,int point) const {
+    StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
+    return SE._is_local;
  }
  accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
-    ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite];
+    ptype = this->_permute_type[point];
+    return & this->_entries_p[point+this->_npoints*osite];
  }

  accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
@ -164,28 +169,22 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame
 {
 public:
  int *closed;
-  StencilEntry *cpu_ptr;
-  ViewMode      mode;
+  //  StencilEntry *cpu_ptr;
 public:
  // default copy constructor
  CartesianStencilView (const CartesianStencilView &refer_to_me) = default;

  CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
-    : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
-    cpu_ptr(this->_entries_p),
-    mode(_mode)
+    : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me)
  {
-    this->_entries_p =(StencilEntry *)
-      MemoryManager::ViewOpen(this->_entries_p,
-			      this->_npoints*this->_osites*sizeof(StencilEntry),
-			      mode,
-			      AdviseDefault);
+    this->ViewOpen(_mode);
+  }
+  void ViewOpen(ViewMode _mode)
+  {
+    this->mode = _mode;
  }

-  void ViewClose(void)
-  {
-    MemoryManager::ViewClose(this->cpu_ptr,this->mode);
-  }
+  void ViewClose(void)  {  }

 };

@ -274,8 +273,8 @@ public:
  std::vector<deviceVector<std::pair<int,int> > > face_table ;
  deviceVector<int> surface_list;

-  std::vector<StencilEntry>  _entries; // Resident in host memory
-  deviceVector<StencilEntry>     _entries_device; // Resident in device memory
+  std::vector<StencilEntry>   _entries; // Resident in host memory
+  deviceVector<StencilEntry>  _entries_device; // Resident in device memory
  std::vector<Packet> Packets;
  std::vector<Merge> Mergers;
  std::vector<Merge> MergersSHM;
@ -364,11 +363,32 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
+    //    std::cout << "Communicate Begin "<<std::endl;
+    //    _grid->Barrier();
+    FlightRecorder::StepLog("Communicate begin");
    // All GPU kernel tasks must complete
    //    accelerator_barrier();     // All kernels should ALREADY be complete
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
                               // But the HaloGather had a barrier too.
    for(int i=0;i<Packets.size();i++){
+      //      std::cout << "Communicate prepare "<<i<<std::endl;
+      //      _grid->Barrier();
+      _grid->StencilSendToRecvFromPrepare(MpiReqs,
+					  Packets[i].send_buf,
+					  Packets[i].to_rank,Packets[i].do_send,
+					  Packets[i].recv_buf,
+					  Packets[i].from_rank,Packets[i].do_recv,
+					  Packets[i].xbytes,Packets[i].rbytes,i);
+    }
+    //    std::cout << "Communicate PollDtoH "<<std::endl;
+    //    _grid->Barrier();
+    _grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
+    //    std::cout << "Communicate CopySynch "<<std::endl;
+    //    _grid->Barrier();
+    acceleratorCopySynchronise();
+    // Starts intranode
+    for(int i=0;i<Packets.size();i++){
+      //      std::cout << "Communicate Begin "<<i<<std::endl;
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
@ -386,18 +406,25 @@ public:

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
+    //    std::cout << "Communicate Complete "<<std::endl;
+    //    _grid->Barrier();
+    FlightRecorder::StepLog("Start communicate complete");
+    //    std::cout << "Communicate Complete PollIRecv "<<std::endl;
+    //    _grid->Barrier();
+    _grid->StencilSendToRecvFromPollIRecv(MpiReqs);
+    //    std::cout << "Communicate Complete Complete "<<std::endl;
+    //    _grid->Barrier();
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
-    // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
+    //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
    //    accelerator_barrier(); 
-    _grid->StencilBarrier(); 
-    // run any checksums
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_recv )
 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
    }
+    FlightRecorder::StepLog("Finish communicate complete");
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@ -419,6 +446,7 @@ public:
    Communicate();
    CommsMergeSHM(compress);
    CommsMerge(compress);
+    accelerator_barrier();
  }

  template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@ -474,6 +502,9 @@ public:
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
    //    accelerator_barrier();
+    //////////////////////////////////
+    // I will overwrite my send buffers
+    //////////////////////////////////
    _grid->StencilBarrier();// Synch shared memory on a single nodes

    assert(source.Grid()==_grid);
@ -487,6 +518,11 @@ public:
      HaloGatherDir(source,compress,point,face_idx);
    }
    accelerator_barrier(); // All my local gathers are complete
+#ifdef NVLINK_GET
+    _grid->StencilBarrier(); // He can now get my local gather, I can get his
+    // Synch shared memory on a single node; could use an asynchronous barrier here and defer check
+    // Or issue barrier AFTER the DMA is running
+#endif    
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
  }
@ -525,6 +561,7 @@ public:
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
      });
      acceleratorFenceComputeStream();
+      // Also fenced in WilsonKernels
    }
  }
  
@ -622,10 +659,10 @@ public:
  ////////////////////////////////////////
  void PrecomputeByteOffsets(void){
    for(int i=0;i<_entries.size();i++){
-      if( _entries[i]._is_local ) {
-	_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
+      if( this->_entries[i]._is_local ) {
+	this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj);
      } else {
-	_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
+	this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj);
      }
    }
  };
@ -653,7 +690,9 @@ public:
 	}
      }
    }
+    //    std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
    surface_list.resize(surface_list_size);
+    std::vector<int> surface_list_host(surface_list_size);
    int32_t ss=0;
    for(int site = 0 ;site< vol4;site++){
      int local = 1;
@ -665,12 +704,13 @@ public:
      if(local == 0) {
 	for(int s=0;s<Ls;s++){
 	  int idx=site*Ls+s;
-	  acceleratorPut(surface_list[ss],idx);
+	  surface_list_host[ss]= idx;
 	  ss++;
 	}
      }
    }
-    std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
+    //    std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@ -758,7 +798,13 @@ public:
    this->_osites  = _grid->oSites();

    _entries.resize(this->_npoints* this->_osites);
-    this->_entries_p = &_entries[0];
+    _entries_device.resize(this->_npoints* this->_osites);
+    this->_entries_host_p = &_entries[0];
+    this->_entries_p = &_entries_device[0];
+
+    //    std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
+    //	      <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
+    
    for(int ii=0;ii<npoints;ii++){

      int i = ii; // reverse direction to get SIMD comms done first
@ -835,6 +881,7 @@ public:
      u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
    }
    PrecomputeByteOffsets();
+    acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry));
  }

  void Local     (int point, int dimension,int shiftpm,int cbmask)
@ -990,10 +1037,10 @@ public:
      for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 	for(int b=0;b<_grid->_slice_block[dimension];b++){
 	  int idx=point+(lo+o+b)*this->_npoints;
-	  _entries[idx]._offset  =ro+o+b;
-	  _entries[idx]._permute=permute;
-	  _entries[idx]._is_local=1;
-	  _entries[idx]._around_the_world=wrap;
+	  this->_entries[idx]._offset  =ro+o+b;
+	  this->_entries[idx]._permute=permute;
+	  this->_entries[idx]._is_local=1;
+	  this->_entries[idx]._around_the_world=wrap;
 	}
 	o +=_grid->_slice_stride[dimension];
      }
@ -1011,10 +1058,10 @@ public:

 	  if ( ocb&cbmask ) {
 	    int idx = point+(lo+o+b)*this->_npoints;
-	    _entries[idx]._offset =ro+o+b;
-	    _entries[idx]._is_local=1;
-	    _entries[idx]._permute=permute;
-	    _entries[idx]._around_the_world=wrap;
+	    this->_entries[idx]._offset =ro+o+b;
+	    this->_entries[idx]._is_local=1;
+	    this->_entries[idx]._permute=permute;
+	    this->_entries[idx]._around_the_world=wrap;
 	  }

 	}
@ -1038,10 +1085,10 @@ public:
      for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 	for(int b=0;b<_grid->_slice_block[dimension];b++){
 	  int idx=point+(so+o+b)*this->_npoints;
-	  _entries[idx]._offset  =offset+(bo++);
-	  _entries[idx]._is_local=0;
-	  _entries[idx]._permute=0;
-	  _entries[idx]._around_the_world=wrap;
+	  this->_entries[idx]._offset  =offset+(bo++);
+	  this->_entries[idx]._is_local=0;
+	  this->_entries[idx]._permute=0;
+	  this->_entries[idx]._around_the_world=wrap;
 	}
 	o +=_grid->_slice_stride[dimension];
      }
@ -1058,10 +1105,10 @@ public:
 	  int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	  if ( ocb & cbmask ) {
 	    int idx = point+(so+o+b)*this->_npoints;
-	    _entries[idx]._offset  =offset+(bo++);
-	    _entries[idx]._is_local=0;
-	    _entries[idx]._permute =0;
-	    _entries[idx]._around_the_world=wrap;
+	    this->_entries[idx]._offset  =offset+(bo++);
+	    this->_entries[idx]._is_local=0;
+	    this->_entries[idx]._permute =0;
+	    this->_entries[idx]._around_the_world=wrap;
 	  }
 	}
 	o +=_grid->_slice_stride[dimension];
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@ -202,13 +202,13 @@ void acceleratorInit(void)

 #ifdef GRID_SYCL

-cl::sycl::queue *theGridAccelerator;
-cl::sycl::queue *theCopyAccelerator;
+sycl::queue *theGridAccelerator;
+sycl::queue *theCopyAccelerator;
 void acceleratorInit(void)
 {
  int nDevices = 1;
-  //  cl::sycl::gpu_selector selector;
-  //  cl::sycl::device selectedDevice { selector };
+  //  sycl::gpu_selector selector;
+  //  sycl::device selectedDevice { selector };
  theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
  theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
@ -242,14 +242,14 @@ void acceleratorInit(void)
  gethostname(hostname, HOST_NAME_MAX+1);
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);

-  auto devices = cl::sycl::device::get_devices();
+  auto devices = sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){

 #define GPU_PROP_STR(prop) \
-    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
+    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());

 #define GPU_PROP_FMT(prop,FMT) \
-    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
+    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());

 #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
    if ( world_rank == 0) {
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -132,27 +132,17 @@ inline void cuda_mem(void)

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
-    int nt=acceleratorThreads();					\
-    typedef uint64_t Iterator;						\
-    auto lambda = [=] accelerator					\
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
-      __VA_ARGS__;							\
-    };									\
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
-  }
-#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
-  {									\
-    int nt=acceleratorThreads();					\
-    typedef uint64_t Iterator;						\
-    auto lambda = [=] accelerator					\
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
-      __VA_ARGS__;							\
-    };									\
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+    if ( num1*num2 ) {							\
+      int nt=acceleratorThreads();					\
+      typedef uint64_t Iterator;					\
+      auto lambda = [=] accelerator					\
+	(Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
+		      __VA_ARGS__;					\
+		    };							\
+      dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
+      dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
+      LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+    }									\
  }

 #define accelerator_for6dNB(iter1, num1,				\
@ -175,19 +165,6 @@ inline void cuda_mem(void)
  }


-#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
-  {									\
-    int nt=acceleratorThreads();					\
-    typedef uint64_t Iterator;						\
-    auto lambda = [=] accelerator					\
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
-      __VA_ARGS__;							\
-    };									\
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
-  }
-
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
    Lambda(x,y,z);
  }
 }
-template<typename lambda>  __global__
-void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
-{
-  // Weird permute is to make lane coalesce for large blocks
-  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
-  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
-  uint64_t z = threadIdx.x;
-  if ( (x < num1) && (y<num2) && (z<num3) ) {
-    Lambda(x,y,z);
-  }
-}

 template<typename lambda>  __global__
 void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@ -243,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
    }									\
  }

+inline void *acceleratorAllocHost(size_t bytes) // Allocate host memory with cudaMallocHost; reports and asserts on failure
+{
+  void *ptr=NULL;
+  auto err = cudaMallocHost((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = (void *) NULL;
+    printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err)); // size_t is printed with %zu, not %d
+    assert(0);
+  }
+  return ptr; // NULL is only returned when the allocation failed and assert() is compiled out
+}
 inline void *acceleratorAllocShared(size_t bytes)
 {
  void *ptr=NULL;
@ -264,18 +241,34 @@ inline void *acceleratorAllocDevice(size_t bytes)
  }
  return ptr;
 };
+
+typedef int acceleratorEvent_t;
+
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
+inline void acceleratorFreeHost(void *ptr){ cudaFreeHost(ptr);}; // host memory obtained from cudaMallocHost is released with cudaFreeHost, not cudaFree
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+  acceleratorCopyToDevice(from,to,bytes); // delegate to the three-argument (from,to,bytes) host-to-device copy; 'stream' is currently unused
+  return 0; // placeholder event value, matching acceleratorCopyFromDeviceAsynch
+}
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
+}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  //auto discard=cudaStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}


 inline int  acceleratorIsCommunicable(void *ptr)
@ -302,7 +295,7 @@ NAMESPACE_END(Grid);

 // Force deterministic reductions
 #define SYCL_REDUCTION_DETERMINISTIC
-#include <sycl/CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
@ -314,8 +307,8 @@ inline void acceleratorMem(void)
  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
 }

-extern cl::sycl::queue *theGridAccelerator;
-extern cl::sycl::queue *theCopyAccelerator;
+extern sycl::queue *theGridAccelerator;
+extern sycl::queue *theCopyAccelerator;

 #ifdef __SYCL_DEVICE_ONLY__
 #define GRID_SIMT
@ -326,24 +319,24 @@ extern cl::sycl::queue *theCopyAccelerator;

 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
- return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; 
+ return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; 
 #else
 return 0;
 #endif
 } // SYCL specific

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
+  theGridAccelerator->submit([&](sycl::handler &cgh) {		\
    unsigned long nt=acceleratorThreads();				\
    if(nt < 8)nt=8;							\
    unsigned long unum1 = num1;						\
    unsigned long unum2 = num2;						\
    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
-    cl::sycl::range<3> local {nt,1,nsimd};				\
-    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
+    sycl::range<3> local {nt,1,nsimd};				\
+    sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
    cgh.parallel_for(							\
-		     cl::sycl::nd_range<3>(global,local),		\
-		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+		     sycl::nd_range<3>(global,local),			\
+		     [=] (sycl::nd_item<3> item) /*mutable*/		\
 		     [[intel::reqd_sub_group_size(16)]]			\
 		     {							\
 		       auto iter1    = item.get_global_id(0);		\
@ -356,26 +349,50 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
+inline void *acceleratorAllocHost(size_t bytes)  { return malloc_host(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
+inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};

 inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);}
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+
+
+///////
+// Asynch event interface
+///////
+typedef sycl::event acceleratorEvent_t;
+
+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  ev.wait();
+}
+
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
+{
+  return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
+}
+
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { return theCopyAccelerator->memcpy(to,from,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
+
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

 inline int  acceleratorIsCommunicable(void *ptr)
 {
 #if 0
-  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
-  if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
+  auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
+  if ( uvm == sycl::usm::alloc::shared ) return 1; // compare against the shared kind (was an assignment)
   else return 0;
 #endif
   return 1;
+
 }

+
 #endif

 //////////////////////////////////////////////
@ -472,6 +489,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
    }								\
  }

+inline void *acceleratorAllocHost(size_t bytes) // Allocate host memory with hipHostMalloc; returns NULL (after logging to stderr) on failure
+{
+  void *ptr=NULL;
+  auto err = hipHostMalloc((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    fprintf(stderr," hipHostMalloc failed for %zu %s \n",bytes,hipGetErrorString(err)); fflush(stderr); // name the call that failed; size_t is printed with %zu
+  }
+  return ptr;
+};
 inline void *acceleratorAllocShared(size_t bytes)
 {
  void *ptr=NULL;
@ -495,37 +522,53 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };

+inline void acceleratorFreeHost(void *ptr){ auto discard=hipHostFree(ptr);}; // host memory obtained from hipHostMalloc is released with hipHostFree, not hipFree
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-//inline void acceleratorCopySynchronise(void) {  }
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+typedef int acceleratorEvent_t;
+
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  return 0;
 }
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyToDevice(from,to,bytes);
+  return 0;
 }
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };

+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  //  auto discard=hipStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
+
+
 #endif

+inline void acceleratorPin(void *ptr,unsigned long bytes)
+{
+#ifdef GRID_SYCL
+  sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
+#endif
+}
+
 //////////////////////////////////////////////
 // Common on all GPU targets
 //////////////////////////////////////////////
 #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
 // FIXME -- the non-blocking nature got broken March 30 2023 by PAB
 #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );  
-#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
-  prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
-  accelerator_barrier(dummy);

 #define accelerator_for( iter, num, nsimd, ... )		\
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\
@ -547,6 +590,8 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize

 #undef GRID_SIMT

+typedef int acceleratorEvent_t;
+
 inline void acceleratorMem(void)
 {
  /*
@ -566,16 +611,21 @@ inline void acceleratorMem(void)

 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { acceleratorCopyToDevice(from,to,bytes); return 0; }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { acceleratorCopyFromDevice(from,to,bytes); return 0; }
+inline void acceleratorEventWait(acceleratorEvent_t ev){}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); return 0;}
+
 inline void acceleratorCopySynchronise(void) {};

 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
 #ifdef HAVE_MM_MALLOC_H
+inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
+inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
 #else
@ -655,9 +705,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
  acceleratorCopySynchronise();
 }

-template<class T> void acceleratorPut(T& dev,T&host)
+template<class T> void acceleratorPut(T& dev,const T&host)
 {
-  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+  acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
 }
 template<class T> T acceleratorGet(T& dev)
 {
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_critical                                     DO_PRAGMA(omp critical)

 #ifdef GRID_OMP
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
-  uint64_t *ufrom = (uint64_t *)from;
+  const uint64_t *ufrom = (const uint64_t *)from;
  uint64_t *uto   = (uint64_t *)to;
  assert(bytes%8==0);
  uint64_t words=bytes/8;
@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
  });
 }
 #else
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
  bcopy(from,to,bytes);
 }
--- a/Grid/util/FlightRecorder.cc
+++ b/Grid/util/FlightRecorder.cc
@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
 int FlightRecorder::LoggingMode;
 int FlightRecorder::ChecksumComms;
 int FlightRecorder::ChecksumCommsSend;
+const char *   FlightRecorder::StepName;
+int32_t  FlightRecorder::StepLoggingCounter;
 int32_t  FlightRecorder::XmitLoggingCounter;
 int32_t  FlightRecorder::RecvLoggingCounter;
 int32_t  FlightRecorder::CsumLoggingCounter;
@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
  CsumLoggingCounter=0;
  NormLoggingCounter=0;
  ReductionLoggingCounter=0;
+  StepName = "No steps started";
+  StepLoggingCounter=0;
 }
 void FlightRecorder::Truncate(void)
 {
@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
    assert(0);
  }
 }
+bool FlightRecorder::StepLog(const char *name)
+{
+  StepName = name;
+  StepLoggingCounter ++;
+  return true;
+}

 void FlightRecorder::SetLoggingModePrint(void)
 {
@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
 {
  return ErrorCounter;
 }
-void FlightRecorder::NormLog(double value)
+bool FlightRecorder::NormLog(double value)
 {
  uint64_t hex = * ( (uint64_t *)&value );
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    NormLoggingCounter++;
+    return true;
  }
  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    NormLogVector.push_back(value);
    NormLoggingCounter++;
+    return true;
  }
  if(LoggingMode == LoggingModeVerify) {

@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)

      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {

+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
 		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
 		NormLoggingCounter,NormLogVector.size(),
 		value, NormLogVector[NormLoggingCounter]); fflush(stderr);

-	if(!ContinueOnFail)assert(0); // Force takedown of job
+	BACKTRACEFP(stderr);
+
+	if(!ContinueOnFail) return false;
 	  
 	ErrorCounter++;
      } else {
@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
    }
    NormLoggingCounter++;
  }
+  return true;
 }
-void FlightRecorder::CsumLog(uint64_t hex)
+bool FlightRecorder::CsumLog(uint64_t hex)
 {
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLoggingCounter++;
+    return true;
  }

  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLogVector.push_back(hex);
    CsumLoggingCounter++;
+    return true;
  }

  if(LoggingMode == LoggingModeVerify) {
@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)

      if ( hex != hexref ) {

+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;

@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		CsumLoggingCounter,hex, hexref);
+	BACKTRACEFP(stderr);
 	fflush(stderr);

-	if(!ContinueOnFail) assert(0); // Force takedown of job
+	if(!ContinueOnFail) return false;
 	  
 	ErrorCounter++;

@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
    }
    CsumLoggingCounter++;
  }
+  return true;
 }
+
 void FlightRecorder::ReductionLog(double local,double global)
 {
  uint64_t hex_l = * ( (uint64_t *)&local );
@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
  if(LoggingMode == LoggingModeVerify) {
    if(ReductionLoggingCounter < ReductionLogVector.size()){
      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		ReductionLoggingCounter,ReductionLogVector.size(),
 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

@ -250,10 +280,11 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
  if(LoggingMode == LoggingModeNone) return;

  if ( ChecksumCommsSend ){
-  uint64_t *ubuf = (uint64_t *)buf;
-  if(LoggingMode == LoggingModeNone) return;
+
+    if(LoggingMode == LoggingModeNone) return;
  
 #ifdef GRID_SYCL
+  uint64_t *ubuf = (uint64_t *)buf;
  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
@ -267,11 +298,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
  if(LoggingMode == LoggingModeVerify) {
    if(XmitLoggingCounter < XmitLogVector.size()){
      if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		XmitLoggingCounter,XmitLogVector.size(),
 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

@ -293,9 +328,9 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
 {
  if ( ChecksumComms ){
-  uint64_t *ubuf = (uint64_t *)buf;
  if(LoggingMode == LoggingModeNone) return;
 #ifdef GRID_SYCL
+  uint64_t *ubuf = (uint64_t *)buf;
  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
@ -309,11 +344,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
  if(LoggingMode == LoggingModeVerify) {
    if(RecvLoggingCounter < RecvLogVector.size()){
      if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		RecvLoggingCounter,RecvLogVector.size(),
 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

--- a/Grid/util/FlightRecorder.h
+++ b/Grid/util/FlightRecorder.h
@ -12,6 +12,8 @@ class FlightRecorder {
  
  static int                   LoggingMode;
  static uint64_t              ErrorCounter;
+  static const char *                StepName;
+  static int32_t               StepLoggingCounter;
  static int32_t               XmitLoggingCounter;
  static int32_t               RecvLoggingCounter;
  static int32_t               CsumLoggingCounter;
@ -30,8 +32,9 @@ class FlightRecorder {
  static void SetLoggingModeRecord(void);
  static void SetLoggingModeVerify(void);
  static void SetLoggingMode(LoggingMode_t mode);
-  static void NormLog(double value);
-  static void CsumLog(uint64_t csum);
+  static bool StepLog(const char *name);
+  static bool NormLog(double value);
+  static bool CsumLog(uint64_t csum);
  static void ReductionLog(double lcl, double glbl);
  static void Truncate(void);
  static void ResetCounters(void);
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);

-
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
+    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
+    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    FlightRecorder::PrintEntireLog = 1;
+    FlightRecorder::ChecksumComms  = 1;
+    FlightRecorder::ChecksumCommsSend=1;
+  }
+  
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@ -549,8 +556,34 @@ void GridLogLayout() {

 void * Grid_backtrace_buffer[_NBACKTRACE];

+void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
+{
+  fprintf(stderr,"Signal handler on host %s\n",hostname);
+  fprintf(stderr,"FlightRecorder step %d stage %s \n",
+	  FlightRecorder::StepLoggingCounter,
+	  FlightRecorder::StepName);
+  fprintf(stderr,"Caught signal %d\n",si->si_signo);
+  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
+  fprintf(stderr,"         code %d\n",si->si_code);
+  // x86 64bit
+#ifdef __linux__
+#ifdef __x86_64__
+  ucontext_t * uc= (ucontext_t *)ptr;
+  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
+  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
+#endif
+#endif
+  fflush(stderr);
+  BACKTRACEFP(stderr);
+  fprintf(stderr,"Called backtrace\n");
+  fflush(stdout);
+  fflush(stderr);
+  return;
+}
+
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 {
+  fprintf(stderr,"Signal handler on host %s\n",hostname);
  fprintf(stderr,"Caught signal %d\n",si->si_signo);
  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
  fprintf(stderr,"         code %d\n",si->si_code);
@ -561,7 +594,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  ucontext_t * uc= (ucontext_t *)ptr;
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
-#define REG(A)  printf("  %s %lx\n",#A,sc-> A);
+#define REG(A)  fprintf(stderr,"  %s %lx\n",#A,sc-> A);
  REG(rdi);
  REG(rsi);
  REG(rbp);
@ -594,8 +627,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)

 void Grid_exit_handler(void)
 {
-  BACKTRACEFP(stdout);
-  fflush(stdout);
+  //  BACKTRACEFP(stdout);
+  //  fflush(stdout);
 }
 void Grid_debug_handler_init(void)
 {
@ -603,10 +636,10 @@ void Grid_debug_handler_init(void)
  sigemptyset (&sa.sa_mask);
  sa.sa_sigaction= Grid_sa_signal_handler;
  sa.sa_flags    = SA_SIGINFO;
-  sigaction(SIGSEGV,&sa,NULL);
+  //  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
  sigaction(SIGBUS,&sa,NULL);
-  sigaction(SIGUSR2,&sa,NULL);
+  //  sigaction(SIGUSR2,&sa,NULL);

  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);

@ -614,7 +647,15 @@ void Grid_debug_handler_init(void)
  sigaction(SIGKILL,&sa,NULL);
  sigaction(SIGILL,&sa,NULL);

-  atexit(Grid_exit_handler);
+  // Non terminating handler, installed on SIGHUP
+  struct sigaction sa_ping;
+  sigemptyset (&sa_ping.sa_mask);
+  sa_ping.sa_sigaction= Grid_usr_signal_handler;
+  sa_ping.sa_flags    = SA_SIGINFO;
+  sigaction(SIGHUP,&sa_ping,NULL);
+
+  //  atexit(Grid_exit_handler);
 }

 NAMESPACE_END(Grid);
+
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@ -50,7 +50,7 @@ namespace Grid{
      int64_t index64;
      IndexFromCoorReversed(coor,index64,dims);
      if ( index64>=2*1024*1024*1024LL ){
-	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+	//	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
      }
      assert(index64<2*1024*1024*1024LL);
      index = (int) index64;
--- a/Makefile.am
+++ b/Makefile.am
@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = Grid HMC benchmarks tests examples
+SUBDIRS = Grid  benchmarks tests examples HMC

 include $(top_srcdir)/doxygen.inc

--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@ -118,7 +118,7 @@ public:
    fprintf(FP,"Packet bytes, direction, GB/s per node\n");
    for(int lat=16;lat<=maxlat;lat+=8){
      //      for(int Ls=8;Ls<=8;Ls*=2){
-      { int Ls=12;
+      { int Ls=8;

 	Coordinate latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
@ -175,8 +175,8 @@ public:
 	    timestat.statistics(t_time);
 	  
 	    dbytes=dbytes*ppn;
-	    double xbytes    = dbytes*0.5;
-	    double bidibytes = dbytes;
+	    double xbytes    = dbytes;
+	    double bidibytes = dbytes*2.0;
 	  
 	    std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
 		     << bytes << " \t "
@ -492,17 +492,18 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t no    = 50;
+	uint64_t ni    = 100;

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
-	  Dw.DhopEO(src_o,r_e,DaggerNo);
+	  for(uint64_t j=0;j<ni;j++){
+	    Dw.DhopEO(src_o,r_e,DaggerNo);
+	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@ -520,11 +521,11 @@ public:
 	double mf_hi, mf_lo, mf_err;

 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;

-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@ -535,6 +536,7 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;

      }

@ -654,17 +656,19 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 500;

-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t no    = 50;
+	uint64_t ni    = 100;

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
-	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	  for(uint64_t j=0;j<ni;j++){
+	    Ds.DhopEO(src_o,r_e,DaggerNo);
+	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@ -675,11 +679,11 @ public:
 	double mf_hi, mf_lo, mf_err;
 	
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
+	mf_err= flops/timestat.min*ni * timestat.err/timestat.mean; // each sample times ni calls: scale like mf_hi/mf_lo/mflops

-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@ -689,6 +693,7 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
      
      }

@ -792,19 +797,18 @@ public:
 	  Dc.M(src,r);
 	}
 	FGrid->Barrier();
-	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t ni = 100;
+	uint64_t no = 50;

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-
 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  Dc.M(src,r);
-	  t1=usecond();
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
+	  double t0=usecond();
+	  for(uint64_t j=0;j<ni;j++){
+	    Dc.M(src,r);
+	  }
+	  double t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
@ -814,20 +818,21 @ public:
 	double mf_hi, mf_lo, mf_err;
 	
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
+	mf_err= flops/timestat.min*ni * timestat.err/timestat.mean; // each sample times ni calls: scale like mf_hi/mf_lo/mflops

-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call   "<< timestat.mean/ni<<std::endl;
      
      }

--- a/configure.ac
+++ b/configure.ac
@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

@ -128,6 +129,20 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac

+############### internal reduction
+AC_ARG_ENABLE([reduction],
+    [AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
+    [ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
+
+case ${ac_REDUCTION} in
+    mpi)
+        ;;
+    grid)
+        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
+    *)
+        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
+esac
+
 ############### tracing
 AC_ARG_ENABLE([tracing],
    [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
@ -226,6 +241,20 @@ case ${ac_SFW_FP16} in
 esac


+############### MPI BOUNCE TO HOST
+AC_ARG_ENABLE([accelerator-aware-mpi],
+    [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
+    [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
+
+# Force accelerator CSHIFT now
+AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
+
+case ${ac_ACCELERATOR_AWARE_MPI} in
+    yes)
+      AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
+    *);;
+esac
+
 ############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
    [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
--- a/examples/Example_taku.cc
+++ b/examples/Example_taku.cc
@ -1,383 +0,0 @@
-/*
- * Warning: This code illustrative only: not well tested, and not meant for production use
- * without regression / tests being applied
- */
-
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-
-RealD LLscale =1.0;
-RealD LCscale =1.0;
-
-template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
-{
-public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-  GridBase *grid;
-  GaugeField U;
-  
-  CovariantLaplacianCshift(GaugeField &_U)    :
-    grid(_U.Grid()),
-    U(_U) {  };
-
-  virtual GridBase *Grid(void) { return grid; };
-
-  virtual void  M    (const Field &in, Field &out)
-  {
-    out=Zero();
-    for(int mu=0;mu<Nd-1;mu++) {
-      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
-      out = out - Gimpl::CovShiftForward(Umu,mu,in);    
-      out = out - Gimpl::CovShiftBackward(Umu,mu,in);    
-      out = out + 2.0*in;
-    }
-  };
-  virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
-  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid
-};
-
-void MakePhase(Coordinate mom,LatticeComplex &phase)
-{
-  GridBase *grid = phase.Grid();
-  auto latt_size = grid->GlobalDimensions();
-  ComplexD ci(0.0,1.0);
-  phase=Zero();
-
-  LatticeComplex coor(phase.Grid());
-  for(int mu=0;mu<Nd;mu++){
-    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-    LatticeCoordinate(coor,mu);
-    phase = phase + (TwoPiL * mom[mu]) * coor;
-  }
-  phase = exp(phase*ci);
-}
-void PointSource(Coordinate &coor,LatticePropagator &source)
-{
-  //  Coordinate coor({0,0,0,0});
-  source=Zero();
-  SpinColourMatrix kronecker; kronecker=1.0;
-  pokeSite(kronecker,source,coor);
-}
-void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
-{
-  GridBase *grid = source.Grid();
-  LatticeComplex noise(grid);
-  LatticeComplex zz(grid); zz=Zero();
-  LatticeInteger t(grid);
-
-  RealD nrm=1.0/sqrt(2);
-  bernoulli(RNG, noise); // 0,1 50:50
-
-  noise = (2.*noise - Complex(1,1))*nrm;
-
-  LatticeCoordinate(t,Tdir);
-  noise = where(t==Integer(tslice), noise, zz);
-
-  source = 1.0;
-  source = source*noise;
-  std::cout << " Z2 wall " << norm2(source) << std::endl;
-}
-template<class Field>
-void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
-{
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
-  Laplacian_t Laplacian(U);
-
-  Integer Iterations = 40;
-  Real width = 2.0;
-  Real coeff = (width*width) / Real(4*Iterations);
-
-  Field tmp(U.Grid());
-  smeared=unsmeared;
-  //  chi = (1-p^2/2N)^N kronecker
-  for(int n = 0; n < Iterations; ++n) {
-    Laplacian.M(smeared,tmp);
-    smeared = smeared - coeff*tmp;
-    std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
-  }
-}
-void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
-{
-  LatticePropagator tmp(source.Grid());
-  PointSource(site,source);
-  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
-  tmp = source;
-  GaussianSmear(U,tmp,source);
-  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
-}
-void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
-{
-  Z2WallSource(RNG,tslice,source);
-  auto tmp = source;
-  GaussianSmear(U,tmp,source);
-}
-void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
-{
-  assert(mom.size()==Nd);
-  assert(mom[Tdir] == 0);
-
-  GridBase * grid = spectator.Grid();
-
-
-  LatticeInteger ts(grid);
-  LatticeCoordinate(ts,Tdir);
-  source = Zero();
-  source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
-
-  LatticeComplex phase(grid);
-  MakePhase(mom,phase);
-
-  source = source *phase;
-}
-template<class Action>
-void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
-{
-  GridBase *UGrid = D.GaugeGrid();
-  GridBase *FGrid = D.FermionGrid();
-
-  LatticeFermion src4  (UGrid); 
-  LatticeFermion src5  (FGrid); 
-  LatticeFermion result5(FGrid);
-  LatticeFermion result4(UGrid);
-  LatticePropagator prop5(FGrid);
-  
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
-  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
-   for(int s=0;s<Nd;s++){
-    for(int c=0;c<Nc;c++){
-      PropToFerm<Action>(src4,source,s,c);
-
-      D.ImportPhysicalFermionSource(src4,src5);
-
-      result5=Zero();
-      schur(D,src5,result5,ZG);
-      std::cout<<GridLogMessage
-	       <<"spin "<<s<<" color "<<c
-	       <<" norm2(src5d) "   <<norm2(src5)
-               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
-
-      D.ExportPhysicalFermionSolution(result5,result4);
-
-      FermToProp<Action>(prop5,result5,s,c);
-      FermToProp<Action>(propagator,result4,s,c);
-    }
-  }
-  LatticePropagator Axial_mu(UGrid); 
-  LatticePropagator Vector_mu(UGrid); 
-
-  LatticeComplex    PA (UGrid); 
-  LatticeComplex    VV (UGrid); 
-  LatticeComplex    PJ5q(UGrid);
-  LatticeComplex    PP (UGrid);
-
-  std::vector<TComplex> sumPA;
-  std::vector<TComplex> sumVV;
-  std::vector<TComplex> sumPP;
-  std::vector<TComplex> sumPJ5q;
-
-  Gamma g5(Gamma::Algebra::Gamma5);
-  D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
-  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
-  sliceSum(PA,sumPA,Tdir);
-
-  int Nt{static_cast<int>(sumPA.size())};
-
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
-
-  PP       = trace(adj(propagator)*propagator); // Pseudoscalar density
-  sliceSum(PP,sumPP,Tdir);
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
-  
-  D.ContractJ5q(prop5,PJ5q);
-  sliceSum(PJ5q,sumPJ5q,Tdir);
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
-
-  Gamma::Algebra GammaV[3] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ
-  };
-  for( int mu=0;mu<3;mu++ ) {
-    Gamma gV(GammaV[mu]);
-    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
-    //    auto ss=sliceSum(Vector_mu,Tdir);
-    //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
-    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
-    sliceSum(VV,sumVV,Tdir);
-    for(int t=0;t<Nt;t++){
-      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
-      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
-	       << " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
-    }
-  }
-
-}
-
-class MesonFile: Serializable {
-public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
-};
-
-void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
-{
-  const int nchannel=3;
-  Gamma::Algebra Gammas[nchannel][2] = {
-    {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX},
-    {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY},
-    {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ}
-  };
-
-  Gamma G5(Gamma::Algebra::Gamma5);
-
-  LatticeComplex meson_CF(q1.Grid());
-  MesonFile MF;
-
-  for(int ch=0;ch<nchannel;ch++){
-
-    Gamma Gsrc(Gammas[ch][0]);
-    Gamma Gsnk(Gammas[ch][1]);
-
-    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
-
-    std::vector<TComplex> meson_T;
-    sliceSum(meson_CF,meson_T, Tdir);
-
-    int nt=meson_T.size();
-
-    std::vector<Complex> corr(nt);
-    for(int t=0;t<nt;t++){
-      corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
-      std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *real(corr[t])<<std::endl;
-    }
-    MF.data.push_back(corr);
-  }
-
-  {
-    XmlWriter WR(file);
-    write(WR,"MesonFile",MF);
-  }
-}
-
-int main (int argc, char ** argv)
-{
-  const int Ls=32;
-
-  Grid_init(&argc,&argv);
-
-  // Double precision grids
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
-								   GridDefaultSimd(Nd,vComplex::Nsimd()),
-								   GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  //////////////////////////////////////////////////////////////////////
-  // You can manage seeds however you like.
-  // Recommend SeedUniqueString.
-  //////////////////////////////////////////////////////////////////////
-  std::vector<int> seeds4({1,2,3,4}); 
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-
-  LatticeGaugeField Umu(UGrid);
-  std::string config;
-  RealD M5=1.8;
-  if( argc > 1 && argv[1][0] != '-' )
-  {
-    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
-    FieldMetaData header;
-    NerscIO::readConfiguration(Umu, header, argv[1]);
-    config=argv[1];
-    M5=1.8;
-  }
-  else
-  {
-    SU<Nc>::ColdConfiguration(Umu);
-    config="ColdConfig";
-    //    RealD P=1.0; // Don't scale
-    RealD P=0.5871119; // 48I
-    //    RealD P=0.6153342; // 64I
-    //    RealD P=0.6388238 // 32Ifine
-    RealD u0 = sqrt(sqrt(P));
-    RealD M5mf = M5 - 4.0*(1.0-u0);
-    RealD w0   = 1.0 - M5mf;
-#if 0
-    // M5=1.8 with U=u0
-    Umu = Umu * u0;
-    LLscale = 1.0;
-    LCscale = 1.0;
-    std::cout<<GridLogMessage <<"Gauge links are u=u0= "<<u0<<std::endl;
-    std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl;
-#else
-    M5 = M5mf;
-    std::cout<<GridLogMessage <<"Gauge links are u=1  "<<std::endl;
-    std::cout<<GridLogMessage <<"u0="<<u0<<std::endl;
-    std::cout<<GridLogMessage <<"M5=M5mf =  "<<M5<<std::endl;
-    LLscale = 1.0/(1-w0*w0)/(1-w0*w0);
-    LCscale = 1.0/(1-w0*w0)/(1-w0*w0);
-#endif
-    std::cout<<GridLogMessage <<"LLscale =  "<<LLscale<<std::endl;
-    std::cout<<GridLogMessage <<"LCscale =  "<<LCscale<<std::endl;
-  }
-
-  std::vector<RealD> masses({ 0.00} ); // u/d, s, c ??
-
-  int nmass = masses.size();
-
-  std::vector<MobiusFermionD *> FermActs;
-  
-  std::cout<<GridLogMessage <<"======================"<<std::endl;
-  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
-  std::cout<<GridLogMessage <<"======================"<<std::endl;
-
-  for(auto mass: masses) {
-
-    RealD b=1.5;// Scale factor b+c=2, b-c=1
-    RealD c=0.5;
-    
-    FermActs.push_back(new MobiusFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c));
-   
-  }
-
-  LatticePropagator point_source(UGrid);
-  //  LatticePropagator wall_source(UGrid);
-
-  Coordinate Origin({0,0,0,0});
-  PointSource   (Origin,point_source);
-  //  Z2WallSource  (RNG4,0,wall_source);
-  
-  std::vector<LatticePropagator> PointProps(nmass,UGrid);
-  //  std::vector<LatticePropagator> GaussProps(nmass,UGrid);
-  //  std::vector<LatticePropagator> Z2Props   (nmass,UGrid);
-
-  for(int m=0;m<nmass;m++) {
-    
-    Solve(*FermActs[m],point_source   ,PointProps[m]);
-  }
-
-  LatticeComplex phase(UGrid);
-  Coordinate mom({0,0,0,0});
-  MakePhase(mom,phase);
-  
-  for(int m1=0 ;m1<nmass;m1++) {
-  for(int m2=m1;m2<nmass;m2++) {
-    std::stringstream ssp,ssg,ssz;
-
-    ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
-    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_wall_meson.xml";
-
-    MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
-    //    MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
-  }}
-
-  Grid_finalize();
-}
-
-
-
--- a/examples/Example_taku1.cc
+++ b/examples/Example_taku1.cc
@ -1,479 +0,0 @@
-/*
- * Warning: This code illustrative only: not well tested, and not meant for production use
- * without regression / tests being applied
- */
-
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-
-RealD LLscale =1.0;
-RealD LCscale =1.0;
-
-template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
-{
-public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-  GridBase *grid;
-  GaugeField U;
-  
-  CovariantLaplacianCshift(GaugeField &_U)    :
-    grid(_U.Grid()),
-    U(_U) {  };
-
-  virtual GridBase *Grid(void) { return grid; };
-
-  virtual void  M    (const Field &in, Field &out)
-  {
-    out=Zero();
-    for(int mu=0;mu<Nd-1;mu++) {
-      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
-      out = out - Gimpl::CovShiftForward(Umu,mu,in);    
-      out = out - Gimpl::CovShiftBackward(Umu,mu,in);    
-      out = out + 2.0*in;
-    }
-  };
-  virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
-  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid
-};
-
-void MakePhase(Coordinate mom,LatticeComplex &phase)
-{
-  GridBase *grid = phase.Grid();
-  auto latt_size = grid->GlobalDimensions();
-  ComplexD ci(0.0,1.0);
-  phase=Zero();
-
-  LatticeComplex coor(phase.Grid());
-  for(int mu=0;mu<Nd;mu++){
-    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-    LatticeCoordinate(coor,mu);
-    phase = phase + (TwoPiL * mom[mu]) * coor;
-  }
-  phase = exp(phase*ci);
-}
-
-void PointSource(Coordinate &coor,LatticePropagator &source)
-{
-  //  Coordinate coor({0,0,0,0});
-  source=Zero();
-  SpinColourMatrix kronecker; kronecker=1.0;
-  pokeSite(kronecker,source,coor);
-}
-void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
-{
-  GridBase *grid = source.Grid();
-  LatticeComplex noise(grid);
-  LatticeComplex zz(grid); zz=Zero();
-  LatticeInteger t(grid);
-
-  RealD nrm=1.0/sqrt(2);
-  bernoulli(RNG, noise); // 0,1 50:50
-
-  noise = (2.*noise - Complex(1,1))*nrm;
-
-  LatticeCoordinate(t,Tdir);
-  noise = where(t==Integer(tslice), noise, zz);
-
-  source = 1.0;
-  source = source*noise;
-  std::cout << " Z2 wall " << norm2(source) << std::endl;
-}
-template<class Field>
-void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
-{
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
-  Laplacian_t Laplacian(U);
-
-  Integer Iterations = 40;
-  Real width = 2.0;
-  Real coeff = (width*width) / Real(4*Iterations);
-
-  Field tmp(U.Grid());
-  smeared=unsmeared;
-  //  chi = (1-p^2/2N)^N kronecker
-  for(int n = 0; n < Iterations; ++n) {
-    Laplacian.M(smeared,tmp);
-    smeared = smeared - coeff*tmp;
-    std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
-  }
-}
-void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
-{
-  LatticePropagator tmp(source.Grid());
-  PointSource(site,source);
-  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
-  tmp = source;
-  GaussianSmear(U,tmp,source);
-  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
-}
-void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
-{
-  Z2WallSource(RNG,tslice,source);
-  auto tmp = source;
-  GaussianSmear(U,tmp,source);
-}
-void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
-{
-  assert(mom.size()==Nd);
-  assert(mom[Tdir] == 0);
-
-  GridBase * grid = spectator.Grid();
-
-
-  LatticeInteger ts(grid);
-  LatticeCoordinate(ts,Tdir);
-  source = Zero();
-  source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
-
-  LatticeComplex phase(grid);
-  MakePhase(mom,phase);
-
-  source = source *phase;
-}
-
-template<class Action>
-void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
-{			   
- GridBase *UGrid = source.Grid();
-  GridBase *FGrid = D.FermionGrid();
-  bool fiveD = true; //calculate 5d free propagator
-  RealD mass = D.Mass();
-  LatticeFermion src4  (UGrid);
-  LatticeFermion result4  (UGrid);
-  LatticeFermion result5(FGrid);
-  LatticeFermion src5(FGrid);
-  LatticePropagator prop5(FGrid);
-  for(int s=0;s<Nd;s++){
-    for(int c=0;c<Nc;c++){
- 
-      PropToFerm<Action>(src4,source,s,c);
-
-      D.ImportPhysicalFermionSource(src4,src5);
-      D.FreePropagator(src5,result5,mass,true);
-      std::cout<<GridLogMessage
-               <<"Free 5D prop spin "<<s<<" color "<<c
-               <<" norm2(src5d) "   <<norm2(src5)
-               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
-
-      D.ExportPhysicalFermionSolution(result5,result4);
-
-      FermToProp<Action>(prop5,result5,s,c);
-      FermToProp<Action>(propagator,result4,s,c);
-    }
-  }
-
-  LatticePropagator Vector_mu(UGrid);
-  LatticeComplex    VV (UGrid);
-  std::vector<TComplex> sumVV;
-  Gamma::Algebra GammaV[3] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ
-  };
-  for( int mu=0;mu<3;mu++ ) {
-    Gamma gV(GammaV[mu]);
-    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
-    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
-    sliceSum(VV,sumVV,Tdir);
-    int Nt = sumVV.size();
-    for(int t=0;t<Nt;t++){
-      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
-      RealD Cont=0;
-      if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
-      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
-               << " 2 pi^2 t^3 C(t) "<< Ct/Cont << " delta Ct "<< Ct-Cont <<std::endl;
-    }
-  }
-}
-template<class Action>
-void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator)
-{			   
-  bool fiveD = false; //calculate 4d free propagator
-  RealD mass = D.Mass();
-  GridBase *UGrid = source.Grid();
-  LatticeFermion src4  (UGrid); 
-  LatticeFermion result4  (UGrid); 
-  for(int s=0;s<Nd;s++){
-    for(int c=0;c<Nc;c++){
-      PropToFerm<Action>(src4,source,s,c);
-      D.FreePropagator(src4,result4,mass,false);
-      FermToProp<Action>(propagator,result4,s,c);
-    }
-  }
-}
-
-template<class Action>
-void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
-{
-  GridBase *UGrid = D.GaugeGrid();
-  GridBase *FGrid = D.FermionGrid();
-
-  LatticeFermion src4  (UGrid); 
-  LatticeFermion src5  (FGrid); 
-  LatticeFermion result5(FGrid);
-  LatticeFermion result4(UGrid);
-  LatticePropagator prop5(FGrid);
-  
-  ConjugateGradient<LatticeFermion> CG(1.0e-10,100000);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
-  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
-   for(int s=0;s<Nd;s++){
-    for(int c=0;c<Nc;c++){
-      PropToFerm<Action>(src4,source,s,c);
-
-      D.ImportPhysicalFermionSource(src4,src5);
-
-      result5=Zero();
-      schur(D,src5,result5,ZG);
-      std::cout<<GridLogMessage
-	       <<"spin "<<s<<" color "<<c
-	       <<" norm2(src5d) "   <<norm2(src5)
-               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
-
-      D.ExportPhysicalFermionSolution(result5,result4);
-
-      FermToProp<Action>(prop5,result5,s,c);
-      FermToProp<Action>(propagator,result4,s,c);
-    }
-  }
-  LatticePropagator Axial_mu(UGrid); 
-  LatticePropagator Vector_mu(UGrid); 
-
-  LatticeComplex    PA (UGrid); 
-  LatticeComplex    VV (UGrid); 
-  LatticeComplex    PJ5q(UGrid);
-  LatticeComplex    PP (UGrid);
-
-  std::vector<TComplex> sumPA;
-  std::vector<TComplex> sumVV;
-  std::vector<TComplex> sumPP;
-  std::vector<TComplex> sumPJ5q;
-
-  Gamma g5(Gamma::Algebra::Gamma5);
-  D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
-  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
-  sliceSum(PA,sumPA,Tdir);
-
-  int Nt{static_cast<int>(sumPA.size())};
-
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
-
-  PP       = trace(adj(propagator)*propagator); // Pseudoscalar density
-  sliceSum(PP,sumPP,Tdir);
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
-  
-  D.ContractJ5q(prop5,PJ5q);
-  sliceSum(PJ5q,sumPJ5q,Tdir);
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
-
-  Gamma::Algebra GammaV[3] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ
-  };
-  for( int mu=0;mu<3;mu++ ) {
-    Gamma gV(GammaV[mu]);
-    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
-    //    auto ss=sliceSum(Vector_mu,Tdir);
-    //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
-    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
-    sliceSum(VV,sumVV,Tdir);
-    for(int t=0;t<Nt;t++){
-      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
-      RealD Cont=0;
-      if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
-      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
-               << " 2 pi^2 t^3 C(t) "<< Ct/Cont << " delta Ct "<< Ct-Cont <<std::endl;
-    }
-  }
-
-}
-
-class MesonFile: Serializable {
-public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
-};
-
-void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
-{
-  const int nchannel=4;
-  Gamma::Algebra Gammas[nchannel][2] = {
-    {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5},
-    {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5},
-    {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5},
-    {Gamma::Algebra::Identity,Gamma::Algebra::Identity}
-  };
-
-  LatticeComplex meson_CF(q1.Grid());
-  MesonFile MF;
-
-  for(int ch=0;ch<nchannel;ch++){
-
-    Gamma Gsrc(Gammas[ch][0]);
-    Gamma Gsnk(Gammas[ch][1]);
-
-    meson_CF = trace(adj(q1)*Gsnk*q2*adj(Gsrc));
-
-    std::vector<TComplex> meson_T;
-    sliceSum(meson_CF,meson_T, Tdir);
-
-    int nt=meson_T.size();
-
-    std::vector<Complex> corr(nt);
-    for(int t=0;t<nt;t++){
-      corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
-      RealD Ct = real(corr[t]);
-      RealD Cont=0;
-      if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
-      std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t * Ct
-		<< " deltaC " <<Ct-Cont<<std::endl;
-    }
-    MF.data.push_back(corr);
-  }
-
-  {
-    XmlWriter WR(file);
-    write(WR,"MesonFile",MF);
-  }
-}
-
-int main (int argc, char ** argv)
-{
-  const int Ls=10;
-
-  Grid_init(&argc,&argv);
-
-  // Double precision grids
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
-								   GridDefaultSimd(Nd,vComplex::Nsimd()),
-								   GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  //////////////////////////////////////////////////////////////////////
-  // You can manage seeds however you like.
-  // Recommend SeedUniqueString.
-  //////////////////////////////////////////////////////////////////////
-  //  std::vector<int> seeds4({1,2,3,4}); 
-  //  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-
-  LatticeGaugeField Umu(UGrid);
-  std::string config;
-  RealD M5=atof(getenv("M5"));
-  RealD mq = atof(getenv("mass"));
-  int   tadpole = atof(getenv("tadpole"));
-  std::vector<RealD> masses({ mq} ); // u/d, s, c ??
-  if( argc > 1 && argv[1][0] != '-' )
-  {
-    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
-    FieldMetaData header;
-    NerscIO::readConfiguration(Umu, header, argv[1]);
-    config=argv[1];
-    LLscale = 1.0;
-    LCscale = 1.0;
-  }
-  else
-  {
-    SU<Nc>::ColdConfiguration(Umu);
-    config="ColdConfig";
-    //    RealD P=1.0; // Don't scale
-    //    RealD P=0.6388238 // 32Ifine
-    //    RealD P=0.6153342; // 64I
-    RealD P=0.5871119; // 48I
-    RealD u0 = sqrt(sqrt(P));
-    RealD w0 = 1 - M5;
-    std::cout<<GridLogMessage <<"For plaquette P="<<P<<" u0= "<<u0<<std::endl;
-    if ( tadpole == 1 ) {
-      Umu = Umu * u0;
-      //      LLscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
-      //      LCscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
-      LLscale = 1.0;
-      LCscale = 1.0;
-      std::cout<<GridLogMessage <<"Gauge links are u= u0 "<<std::endl;
-      std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl;
-    } else if ( tadpole == 2) {
-      std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
-      LLscale = 1.0;
-      LCscale = 1.0;
-      std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl;
-    } else {
-      LLscale = 1.0/u0/u0;
-      LCscale = 1.0/u0/u0;
-      M5 = M5 - 4.0 * (1-u0);
-      std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
-      std::cout<<GridLogMessage <<"M5mf =  "<<M5<<std::endl;
-    }
-    std::cout<<GridLogMessage <<"mq =  "<<mq<<std::endl;
-    std::cout<<GridLogMessage <<"LLscale =  "<<LLscale<<std::endl;
-    std::cout<<GridLogMessage <<"LCscale =  "<<LCscale<<std::endl;
-  }
-
-  int nmass = masses.size();
-
-  typedef DomainWallFermionD FermionActionD;
-  //  typedef MobiusFermionD FermionActionD;
-  std::vector<FermionActionD *> FermActs;
-  std::vector<DomainWallFermionD *> DWFActs;
-  
-  std::cout<<GridLogMessage <<"======================"<<std::endl;
-  std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
-  std::cout<<GridLogMessage <<"======================"<<std::endl;
-
-  for(auto mass: masses) {
-    std::vector<Complex> boundary = {1,1,1,-1};
-    FermionActionD::ImplParams Params(boundary);
-    RealD b=1.5;
-    RealD c=0.5;
-    std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl;
-    //    DWFActs.push_back(new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5));
-    FermActs.push_back(new FermionActionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params));
-    //    FermActs.push_back(new FermionActionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass+0.001,M5,b,c));
-    std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl;
-  }
-
-  LatticePropagator point_source(UGrid);
-
-  Coordinate Origin({0,0,0,0});
-  PointSource   (Origin,point_source);
-  
-  std::vector<LatticePropagator> PointProps(nmass,UGrid);
-  //  std::vector<LatticePropagator> FreeProps(nmass,UGrid);
-  //  LatticePropagator delta(UGrid);
-
-  for(int m=0;m<nmass;m++) {
-    Solve(*FermActs[m],point_source   ,PointProps[m]);
-    //    MasslessFreePropagator(*FermActs[m],point_source   ,FreeProps[m]);
-
-    //    delta = PointProps[m] - FreeProps[m];
-    //    std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl;
-  }
-
-  LatticeComplex phase(UGrid);
-  Coordinate mom({0,0,0,0});
-  MakePhase(mom,phase);
-  
-  for(int m1=0 ;m1<nmass;m1++) {
-  for(int m2=m1;m2<nmass;m2++) {
-    std::stringstream ssp,ssg,ssz;
-
-    ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
-    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml";
-
-    std::cout << "CG determined VV correlation function"<<std::endl;
-    MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
-    
-    //    std::cout << "FFT derived VV correlation function"<<std::endl;
-    //    MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase);
-  }}
-
-  Grid_finalize();
-}
-
-
-
--- a/examples/Example_taku2.cc
+++ b/examples/Example_taku2.cc
@ -1,433 +0,0 @@
-/*
- * Warning: This code illustrative only: not well tested, and not meant for production use
- * without regression / tests being applied
- */
-
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-
-RealD LLscale =1.0;
-RealD LCscale =1.0;
-
-template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
-{
-public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-  GridBase *grid;
-  GaugeField U;
-  
-  CovariantLaplacianCshift(GaugeField &_U)    :
-    grid(_U.Grid()),
-    U(_U) {  };
-
-  virtual GridBase *Grid(void) { return grid; };
-
-  virtual void  M    (const Field &in, Field &out)
-  {
-    out=Zero();
-    for(int mu=0;mu<Nd-1;mu++) {
-      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
-      out = out - Gimpl::CovShiftForward(Umu,mu,in);    
-      out = out - Gimpl::CovShiftBackward(Umu,mu,in);    
-      out = out + 2.0*in;
-    }
-  };
-  virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
-  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid
-};
-
-void MakePhase(Coordinate mom,LatticeComplex &phase)
-{
-  GridBase *grid = phase.Grid();
-  auto latt_size = grid->GlobalDimensions();
-  ComplexD ci(0.0,1.0);
-  phase=Zero();
-
-  LatticeComplex coor(phase.Grid());
-  for(int mu=0;mu<Nd;mu++){
-    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-    LatticeCoordinate(coor,mu);
-    phase = phase + (TwoPiL * mom[mu]) * coor;
-  }
-  phase = exp(phase*ci);
-}
-
-void PointSource(Coordinate &coor,LatticePropagator &source)
-{
-  //  Coordinate coor({0,0,0,0});
-  source=Zero();
-  SpinColourMatrix kronecker; kronecker=1.0;
-  pokeSite(kronecker,source,coor);
-}
-void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
-{
-  GridBase *grid = source.Grid();
-  LatticeComplex noise(grid);
-  LatticeComplex zz(grid); zz=Zero();
-  LatticeInteger t(grid);
-
-  RealD nrm=1.0/sqrt(2);
-  bernoulli(RNG, noise); // 0,1 50:50
-
-  noise = (2.*noise - Complex(1,1))*nrm;
-
-  LatticeCoordinate(t,Tdir);
-  noise = where(t==Integer(tslice), noise, zz);
-
-  source = 1.0;
-  source = source*noise;
-  std::cout << " Z2 wall " << norm2(source) << std::endl;
-}
-template<class Field>
-void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
-{
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
-  Laplacian_t Laplacian(U);
-
-  Integer Iterations = 40;
-  Real width = 2.0;
-  Real coeff = (width*width) / Real(4*Iterations);
-
-  Field tmp(U.Grid());
-  smeared=unsmeared;
-  //  chi = (1-p^2/2N)^N kronecker
-  for(int n = 0; n < Iterations; ++n) {
-    Laplacian.M(smeared,tmp);
-    smeared = smeared - coeff*tmp;
-    std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
-  }
-}
-void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
-{
-  LatticePropagator tmp(source.Grid());
-  PointSource(site,source);
-  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
-  tmp = source;
-  GaussianSmear(U,tmp,source);
-  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
-}
-void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
-{
-  Z2WallSource(RNG,tslice,source);
-  auto tmp = source;
-  GaussianSmear(U,tmp,source);
-}
-void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
-{
-  assert(mom.size()==Nd);
-  assert(mom[Tdir] == 0);
-
-  GridBase * grid = spectator.Grid();
-
-
-  LatticeInteger ts(grid);
-  LatticeCoordinate(ts,Tdir);
-  source = Zero();
-  source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
-
-  LatticeComplex phase(grid);
-  MakePhase(mom,phase);
-
-  source = source *phase;
-}
-
-template<class Action>
-void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
-{			   
- GridBase *UGrid = source.Grid();
-  GridBase *FGrid = D.FermionGrid();
-  bool fiveD = true; //calculate 4d free propagator                                                                                                                 
-  RealD mass = D.Mass();
-  LatticeFermion src4  (UGrid);
-  LatticeFermion result4  (UGrid);
-  LatticeFermion result5(FGrid);
-  LatticeFermion src5(FGrid);
-  LatticePropagator prop5(FGrid);
-  for(int s=0;s<Nd;s++){
-    for(int c=0;c<Nc;c++){
- 
-      PropToFerm<Action>(src4,source,s,c);
-
-      D.ImportPhysicalFermionSource(src4,src5);
-      D.FreePropagator(src5,result5,mass,true);
-      std::cout<<GridLogMessage
-               <<"spin "<<s<<" color "<<c
-               <<" norm2(src5d) "   <<norm2(src5)
-               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
-
-      D.ExportPhysicalFermionSolution(result5,result4);
-
-      FermToProp<Action>(prop5,result5,s,c);
-      FermToProp<Action>(propagator,result4,s,c);
-    }
-  }
-
-  LatticePropagator Vector_mu(UGrid);
-  LatticeComplex    VV (UGrid);
-  std::vector<TComplex> sumVV;
-  Gamma::Algebra GammaV[3] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ
-  };
-  for( int mu=0;mu<3;mu++ ) {
-    Gamma gV(GammaV[mu]);
-    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
-    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
-    sliceSum(VV,sumVV,Tdir);
-    int Nt = sumVV.size();
-    for(int t=0;t<Nt;t++){
-      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
-      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
-               << " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
-    }
-  }
-}
-
-template<class Action>
-void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
-{
-  GridBase *UGrid = D.GaugeGrid();
-  GridBase *FGrid = D.FermionGrid();
-
-  LatticeFermion src4  (UGrid); 
-  LatticeFermion src5  (FGrid); 
-  LatticeFermion result5(FGrid);
-  LatticeFermion result4(UGrid);
-  LatticePropagator prop5(FGrid);
-  
-  ConjugateGradient<LatticeFermion> CG(1.0e-6,100000);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
-  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
-   for(int s=0;s<Nd;s++){
-    for(int c=0;c<Nc;c++){
-      PropToFerm<Action>(src4,source,s,c);
-
-      D.ImportPhysicalFermionSource(src4,src5);
-
-      result5=Zero();
-      schur(D,src5,result5,ZG);
-      std::cout<<GridLogMessage
-	       <<"spin "<<s<<" color "<<c
-	       <<" norm2(src5d) "   <<norm2(src5)
-               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
-
-      D.ExportPhysicalFermionSolution(result5,result4);
-
-      FermToProp<Action>(prop5,result5,s,c);
-      FermToProp<Action>(propagator,result4,s,c);
-    }
-  }
-  LatticePropagator Axial_mu(UGrid); 
-  LatticePropagator Vector_mu(UGrid); 
-
-  LatticeComplex    PA (UGrid); 
-  LatticeComplex    VV (UGrid); 
-  LatticeComplex    PJ5q(UGrid);
-  LatticeComplex    PP (UGrid);
-
-  std::vector<TComplex> sumPA;
-  std::vector<TComplex> sumVV;
-  std::vector<TComplex> sumPP;
-  std::vector<TComplex> sumPJ5q;
-
-  Gamma g5(Gamma::Algebra::Gamma5);
-  D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
-  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
-  sliceSum(PA,sumPA,Tdir);
-
-  int Nt{static_cast<int>(sumPA.size())};
-
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
-
-  PP       = trace(adj(propagator)*propagator); // Pseudoscalar density
-  sliceSum(PP,sumPP,Tdir);
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
-  
-  D.ContractJ5q(prop5,PJ5q);
-  sliceSum(PJ5q,sumPJ5q,Tdir);
-  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
-
-  Gamma::Algebra GammaV[3] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ
-  };
-  for( int mu=0;mu<3;mu++ ) {
-    Gamma gV(GammaV[mu]);
-    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
-    //    auto ss=sliceSum(Vector_mu,Tdir);
-    //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
-    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
-    sliceSum(VV,sumVV,Tdir);
-    for(int t=0;t<Nt;t++){
-      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
-      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
-	       << " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
-    }
-  }
-
-}
-
-class MesonFile: Serializable {
-public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
-};
-
-void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
-{
-  const int nchannel=3;
-  Gamma::Algebra Gammas[nchannel][2] = {
-    {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX},
-    {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY},
-    //    {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ}
-    {Gamma::Algebra::Gamma5,Gamma::Algebra::Gamma5}
-  };
-
-  Gamma G5(Gamma::Algebra::Gamma5);
-
-  LatticeComplex meson_CF(q1.Grid());
-  MesonFile MF;
-
-  for(int ch=0;ch<nchannel;ch++){
-
-    Gamma Gsrc(Gammas[ch][0]);
-    Gamma Gsnk(Gammas[ch][1]);
-
-    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
-
-    std::vector<TComplex> meson_T;
-    sliceSum(meson_CF,meson_T, Tdir);
-
-    int nt=meson_T.size();
-
-    std::vector<Complex> corr(nt);
-    for(int t=0;t<nt;t++){
-      corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
-      std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *real(corr[t])<<std::endl;
-    }
-    MF.data.push_back(corr);
-  }
-
-  {
-    XmlWriter WR(file);
-    write(WR,"MesonFile",MF);
-  }
-}
-
-int main (int argc, char ** argv)
-{
-  const int Ls=8;
-
-  Grid_init(&argc,&argv);
-
-  // Double precision grids
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
-								   GridDefaultSimd(Nd,vComplex::Nsimd()),
-								   GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  //////////////////////////////////////////////////////////////////////
-  // You can manage seeds however you like.
-  // Recommend SeedUniqueString.
-  //////////////////////////////////////////////////////////////////////
-  //  std::vector<int> seeds4({1,2,3,4}); 
-  //  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-
-  LatticeGaugeField Umu(UGrid);
-  std::string config;
-  RealD M5=atof(getenv("M5"));
-  RealD mq = atof(getenv("mass"));
-  std::vector<RealD> masses({ mq} ); // u/d, s, c ??
-  if( argc > 1 && argv[1][0] != '-' )
-  {
-    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
-    FieldMetaData header;
-    NerscIO::readConfiguration(Umu, header, argv[1]);
-    config=argv[1];
-    LLscale = 1.0;
-    LCscale = 1.0;
-  }
-  else
-  {
-    SU<Nc>::ColdConfiguration(Umu);
-    config="ColdConfig";
-    //    RealD P=1.0; // Don't scale
-    //    RealD P=0.6153342; // 64I
-    //    RealD P=0.6388238 // 32Ifine
-    //    RealD P=0.5871119; // 48I
-    //    RealD u0 = sqrt(sqrt(P));
-    //    Umu = Umu * u0;
-    RealD w0 = 1 - M5;
-    LLscale = 1.0/(1-w0*w0)/(1-w0*w0);
-    LCscale = 1.0/(1-w0*w0)/(1-w0*w0);
-    std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
-    std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl;
-    std::cout<<GridLogMessage <<"mq =  "<<mq<<std::endl;
-    std::cout<<GridLogMessage <<"LLscale =  "<<LLscale<<std::endl;
-    std::cout<<GridLogMessage <<"LCscale =  "<<LCscale<<std::endl;
-  }
-
-  int nmass = masses.size();
-
-  std::vector<DomainWallFermionD *> FermActs;
-  
-  std::cout<<GridLogMessage <<"======================"<<std::endl;
-  std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
-  std::cout<<GridLogMessage <<"======================"<<std::endl;
-
-  for(auto mass: masses) {
-
-    std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl;
-    FermActs.push_back(new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5));
-    std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl;
-   
-  }
-
-  LatticePropagator point_source(UGrid);
-
-  Coordinate Origin({0,0,0,0});
-  PointSource   (Origin,point_source);
-  
-  //  std::vector<LatticePropagator> PointProps(nmass,UGrid);
-  std::vector<LatticePropagator> FreeProps(nmass,UGrid);
-  LatticePropagator delta(UGrid);
-
-  for(int m=0;m<nmass;m++) {
-    //    Solve(*FermActs[m],point_source   ,PointProps[m]);
-    MasslessFreePropagator(*FermActs[m],point_source   ,FreeProps[m]);
-
-    //    delta = PointProps[m] - FreeProps[m];
-    //    std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl;
-  }
-
-  LatticeComplex phase(UGrid);
-  Coordinate mom({0,0,0,0});
-  MakePhase(mom,phase);
-  
-  for(int m1=0 ;m1<nmass;m1++) {
-  for(int m2=m1;m2<nmass;m2++) {
-    std::stringstream ssp,ssg,ssz;
-
-    ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
-    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml";
-
-    //    std::cout << "CG determined VV correlation function"<<std::endl;
-    //    MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
-    
-    std::cout << "FFT derived VV correlation function"<<std::endl;
-    MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase);
-  }}
-
-  Grid_finalize();
-}
-
-
-
--- a/systems/Aurora-AOT/config-command
+++ b/systems/Aurora-AOT/config-command
@ -0,0 +1,23 @@
+#Ahead of time compile for PVC
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
+
+#JIT compile 
+#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
+#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
+
+../../configure \
+	--enable-simd=GPU \
+	--enable-gen-simd-width=64 \
+	--enable-comms=mpi-auto \
+	--enable-debug \
+	--disable-gparity \
+	--disable-fermion-reps \
+	--with-lime=$CLIME \
+	--enable-shm=nvlink \
+	--enable-accelerator=sycl \
+	--enable-accelerator-aware-mpi=yes\
+	--enable-unified=no \
+	MPICXX=mpicxx \
+	CXX=icpx 
+
--- a/systems/Aurora-AOT/sourceme.sh
+++ b/systems/Aurora-AOT/sourceme.sh
@ -0,0 +1,15 @@
+#module load oneapi/release/2023.12.15.001
+#module load mpich/icc-all-debug-pmix-gpu/52.2
+#module load mpich-config/mode/deterministic
+#module load intel_compute_runtime/release/821.35
+
+source ~/spack/share/spack/setup-env.sh 
+spack load c-lime
+spack load openssl
+export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
+export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
+export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
+export http_proxy=http://proxy.alcf.anl.gov:3128
+export https_proxy=http://proxy.alcf.anl.gov:3128
+git config --global http.proxy http://proxy.alcf.anl.gov:3128
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
--- a/systems/Aurora-AOT/tests/reproBigJob.pbs
+++ b/systems/Aurora-AOT/tests/reproBigJob.pbs
@ -0,0 +1,74 @@
+#!/bin/bash
+
+#PBS -l select=512
+#PBS -q EarlyAppAccess
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=6:00:00
+#PBS -N reproBigJob
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+#module load oneapi/eng-compiler/2023.05.15.003
+#module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+
+export GRID_PRINT_ENTIRE_LOG=0
+export GRID_CHECKSUM_RECV_BUF=0
+export GRID_CHECKSUM_SEND_BUF=0
+
+export MPICH_OFI_NIC_POLICY=GPU
+
+#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+cd $PBS_O_WORKDIR
+
+cp $PBS_NODEFILE nodefile
+
+DIR=reproBigJob.$PBS_JOBID
+
+mkdir -p $DIR
+cd $DIR
+
+cp $PBS_NODEFILE nodefile
+
+BINARY=../Test_dwf_mixedcg_prec
+
+cat > pingjob <<'EOF' # cat (not echo) copies the heredoc into pingjob; quoted EOF keeps $node literal until pingjob itself runs
+while read node ; 
+do
+	echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
+done < nodefile
+EOF
+
+CMD="mpiexec -np 6144 -ppn 12  -envall --hostfile nodefile \
+	     ../gpu_tile_compact.sh \
+	     $BINARY --mpi 8.8.8.12 --grid 128.128.128.288 \
+	--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --debug-stdout --log Message --debug-signals --comms-overlap"
+
+echo $CMD > command-line
+env > environment
+$CMD
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm -f core.* # -f: last command of the script, so do not fail the job (or print an error) when no core files were dumped
--- a/systems/Aurora/benchmarks/bench1.pbs
+++ b/systems/Aurora/benchmarks/bench1.pbs
@ -1,27 +1,24 @@
 #!/bin/bash

-#PBS -q EarlyAppAccess
+##PBS -q EarlyAppAccess
+#PBS -q debug
 #PBS -l select=1
 #PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA

-#export OMP_PROC_BIND=spread
-#unset OMP_PLACES
-
 cd $PBS_O_WORKDIR

 source ../sourceme.sh
-module load pti-gpu

-#cat $PBS_NODEFILE
+cp $PBS_NODEFILE nodefile

 export OMP_NUM_THREADS=4
-export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPICH_OFI_NIC_POLICY=GPU

+#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
-
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
@ -29,39 +26,11 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
-export MPICH_OFI_NIC_POLICY=GPU
-
-# 12 ppn, 2 nodes, 24 ranks
-#
-CMD="mpiexec -np 1 -ppn 1  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_usqcd --mpi 1.1.1.1 --grid 24.32.32.24 \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
-$CMD | tee usqcd.log
-
-
-CMD="mpiexec -np 1 -ppn 1  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
-$CMD | tee 1tile.dwf

 CMD="mpiexec -np 12 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-#$CMD | tee 1node.32.32.32.48.dwf
+	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.96 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 "

-
-CMD="mpiexec -np 12 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-#$CMD | tee 1node.64.64.32.96.dwf
-
-CMD="mpiexec -np 12 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-#$CMD | tee 1node.64.32.32.48.dwf
+echo $CMD
+$CMD

--- a/systems/Aurora/benchmarks/bench16.pbs
+++ b/systems/Aurora/benchmarks/bench16.pbs
@ -0,0 +1,74 @@
+#!/bin/bash
+
+##PBS -q LatticeQCD_aesp_CNDA
+#PBS -q debug-scaling
+##PBS -q prod
+#PBS -l select=16
+#PBS -l walltime=00:20:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cp $PBS_NODEFILE nodefile
+
+export OMP_NUM_THREADS=4
+export MPICH_OFI_NIC_POLICY=GPU
+
+#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+
+#
+# Local vol 16.16.16.32
+#
+
+LX=16
+LY=16
+LZ=16
+LT=32
+
+NX=2
+NY=2
+NZ=4
+NT=1
+
+GX=2
+GY=2
+GZ=1
+GT=3
+
+PX=$((NX * GX ))
+PY=$((NY * GY ))
+PZ=$((NZ * GZ ))
+PT=$((NT * GT ))
+
+VX=$((PX * LX ))
+VY=$((PY * LY ))
+VZ=$((PZ * LZ ))
+VT=$((PT * LT ))
+
+NP=$((PX*PY*PZ*PT))
+VOL=${VX}.${VY}.${VZ}.${VT}
+AT=8
+MPI=${PX}.${PY}.${PZ}.${PT}
+
+CMD="mpiexec -np $NP -ppn 12  -envall \
+	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi $MPI --grid $VOL \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
+
+echo VOL $VOL
+echo MPI $MPI
+echo NPROC $NP
+echo $CMD
+$CMD
+
--- a/systems/Aurora/benchmarks/bench2.pbs
+++ b/systems/Aurora/benchmarks/bench2.pbs
@ -1,55 +1,48 @@
 #!/bin/bash

-#PBS -q EarlyAppAccess
+##PBS -q EarlyAppAccess
+#PBS -q debug
 #PBS -l select=2
 #PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA

-#export OMP_PROC_BIND=spread
-#unset OMP_PLACES
-
 cd $PBS_O_WORKDIR

 source ../sourceme.sh
-module load pti-gpu

-#cat $PBS_NODEFILE
+cp $PBS_NODEFILE nodefile

 export OMP_NUM_THREADS=4
-export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPICH_OFI_NIC_POLICY=GPU

+#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16

-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
-export MPICH_OFI_NIC_POLICY=GPU
-
-# 12 ppn, 2 nodes, 24 ranks
 #
+# Local vol 16.16.16.32
+#
+
+#VOL=32.64.64.96
+
+for VOL in 32.32.32.96 32.64.64.96
+do
+for AT in 32
+do
 CMD="mpiexec -np 24 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
-$CMD | tee 2node.comms
+	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "

-
-CMD="mpiexec -np 24 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
-$CMD | tee 2node.32.32.64.48.dwf
-
-
-CMD="mpiexec -np 24 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
-$CMD | tee 2node.64.64.64.96.dwf
+echo $CMD
+$CMD
+done
+done

--- a/systems/Aurora/benchmarks/gpu_tile_compact.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh
@ -4,10 +4,12 @@
 #export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
 #export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)

-export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
+export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 );
+export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 ); # NOTE(review): 14 entries, but NUMA_PMAP and GPU_MAP have 12 and all are indexed by PALS_LOCAL_RANKID - confirm intended values
 export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )

-export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
+export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
  
 unset EnableWalkerPartition
@ -17,18 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero

 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

+#export MPI_BUF_NUMA=$NUMAH
+
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "

 if [ $PALS_RANKID = "0" ]
 then
-#    numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline  "$@"
-#    numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
-    numactl -m $NUMA -N $NUMA  "$@"
+#    numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
+    numactl -p $NUMAP -N $NUMAP  "$@"
 else 
-    numactl -m $NUMA -N $NUMA  "$@"
+    numactl -p $NUMAP -N $NUMAP  "$@"
 fi
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@ -1,17 +1,25 @@
+#Ahead of time compile for PVC

-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions "
-../../configure \
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
+
+#JIT compile 
+#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
+#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
+
+../configure \
 	--enable-simd=GPU \
+	--enable-reduction=grid \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-debug \
+	--prefix $HOME/gpt-install \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--with-lime=$CLIME \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
-	--enable-accelerator-aware-mpi=yes\
+	--enable-accelerator-aware-mpi=no\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx 
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@ -1,5 +1,9 @@
-module load oneapi/release/2023.12.15.001
+#module load oneapi/release/2023.12.15.001
+#module load mpich/icc-all-debug-pmix-gpu/52.2
+#module load mpich-config/mode/deterministic
 #module load intel_compute_runtime/release/821.35
+module load pti-gpu
+
 source ~/spack/share/spack/setup-env.sh 
 spack load c-lime
 spack load openssl
--- a/systems/Aurora/tests/reproBigJob.pbs
+++ b/systems/Aurora/tests/reproBigJob.pbs
@ -15,13 +15,13 @@

 # 56 cores / 6 threads ~9
 export OMP_NUM_THREADS=6
-#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
-#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
-#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
-#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
-#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
-#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1

 #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
@ -30,20 +30,22 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

 export GRID_PRINT_ENTIRE_LOG=0
-export GRID_CHECKSUM_RECV_BUF=1
-export GRID_CHECKSUM_SEND_BUF=1
+export GRID_CHECKSUM_RECV_BUF=0
+export GRID_CHECKSUM_SEND_BUF=0

 export MPICH_OFI_NIC_POLICY=GPU

-export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
-export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
-export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
-unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
-unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
-unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE

 cd $PBS_O_WORKDIR

+cp $PBS_NODEFILE nodefile
+
 DIR=reproBigJob.$PBS_JOBID

 mkdir -p $DIR
@ -51,10 +53,19 @@ cd $DIR

 cp $PBS_NODEFILE nodefile

+BINARY=../Test_dwf_mixedcg_prec
+
+cat > pingjob <<'EOF' # cat (not echo) copies the heredoc into pingjob; quoted EOF keeps $node literal until pingjob itself runs
+while read node ; 
+do
+	echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
+done < nodefile
+EOF
+
 CMD="mpiexec -np 384 -ppn 12  -envall --hostfile nodefile \
 	     ../gpu_tile_compact.sh \
-	     ../Test_dwf_mixedcg_prec --mpi 4.4.4.6 --grid 128.128.128.96  \
-		--shm-mpi 1 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals"
+	     $BINARY --mpi 4.4.4.6 --grid 128.128.128.96  \
+		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals"

 echo $CMD > command-line
 env > environment
--- a/systems/Frontier-rocm631/config-command
+++ b/systems/Frontier-rocm631/config-command
@ -0,0 +1,22 @@
+CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
+../../configure --enable-comms=mpi-auto \
+--with-lime=$CLIME \
+--enable-unified=no \
+--enable-shm=nvlink \
+--enable-tracing=none \
+--enable-accelerator=hip \
+--enable-gen-simd-width=64 \
+--disable-gparity \
+--disable-fermion-reps \
+--enable-simd=GPU \
+--with-gmp=$OLCF_GMP_ROOT \
+--with-fftw=$FFTW_DIR/.. \
+--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
+--disable-fermion-reps \
+CXX=hipcc MPICXX=mpicxx \
+CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
+ LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
+
+
+
+
--- a/systems/Frontier-rocm631/sourceme631.sh
+++ b/systems/Frontier-rocm631/sourceme631.sh
@ -0,0 +1,16 @@
+
+echo spack
+. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
+
+#module load cce/15.0.1
+
+module load rocm/6.3.1
+module load cray-fftw
+module load craype-accel-amd-gfx90a
+export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
+
+#Ugly hacks to get down level software working on current system
+#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
+#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
+
--- a/systems/Frontier/benchmarks/bench2.slurm
+++ b/systems/Frontier/benchmarks/bench2.slurm
@ -30,14 +30,10 @@ source ${root}/sourceme.sh

 export OMP_NUM_THREADS=7
 export MPICH_GPU_SUPPORT_ENABLED=1
-export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
-
-for vol in 32.32.32.64
+#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+#64.64.32.96
+for vol in 64.64.32.64
 do
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol
-
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
 done

--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --with-lime=$CLIME \
 --enable-unified=no \
 --enable-shm=nvlink \
--enable-tracing=timer \
+--enable-tracing=none \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
--enable-accelerator-cshift \
 --with-gmp=$OLCF_GMP_ROOT \
 --with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
-CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
- LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
+CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
+ LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"



--- a/systems/Frontier/sourceme.sh
+++ b/systems/Frontier/sourceme.sh
@ -1,12 +1,25 @@
+
+echo spack
 . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
-spack load c-lime
-module load emacs 
-module load PrgEnv-gnu
-module load rocm
-module load cray-mpich
-module load gmp
+
+module load cce/15.0.1
+module load rocm/5.3.0
 module load cray-fftw
 module load craype-accel-amd-gfx90a
+
+#Ugly hacks to get down level software working on current system
+export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
+ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
+
+#echo spack load c-lime
+#spack load c-lime
+#module load emacs 
+##module load PrgEnv-gnu
+##module load cray-mpich
+##module load cray-fftw
+##module load craype-accel-amd-gfx90a
+##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
 #Hack for lib
-#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
+##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
--- a/systems/Linux-cuda/config-command
+++ b/systems/Linux-cuda/config-command
@ -0,0 +1,18 @@
+../../configure \
+    --enable-comms=mpi \
+    --enable-simd=GPU \
+    --enable-gen-simd-width=64 \
+    --enable-shm=nvlink \
+    --with-lime=$CLIME \
+    --with-hdf5=$HDF5 \
+    --with-fftw=$FFTW \
+    --with-gmp=$GMP \
+    --with-mpfr=$MPFR \
+    --enable-accelerator=cuda \
+    --disable-gparity \
+    --disable-fermion-reps \
+    --disable-unified \
+    CXX=nvcc \
+    LDFLAGS="-cudart shared -L$NVIDIALIB -lcublas" \
+    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared"
+
--- a/systems/Linux-cuda/sourceme.sh
+++ b/systems/Linux-cuda/sourceme.sh
@ -0,0 +1,16 @@
+. /home/paboyle/spack/share/spack/setup-env.sh
+spack load cuda@12.0.0
+spack load c-lime
+spack load gmp
+spack load mpfr
+spack load hdf5
+spack load fftw
+spack load openmpi
+export FFTW=`spack find --paths fftw | grep fftw | cut -c 14-`
+export HDF5=`spack find --paths hdf5 | grep hdf5 | cut -c 14-`
+export CUDA=`spack find --paths cuda@11.8.0 | grep cuda | cut -c 14-`
+export CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
+export GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
+export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
+export NVIDIALIB=$CUDA/targets/x86_64-linux/lib/
+export LD_LIBRARY_PATH=$NVIDIALIB:$LD_LIBRARY_PATH:$HDF5/lib:$FFTW/lib:$CLIME/lib/:$MPFR/lib
--- a/systems/Lumi/config-command
+++ b/systems/Lumi/config-command
@ -1,7 +1,7 @@
 spack load c-lime
 spack load gmp
 spack load mpfr
-CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
+CLIME=`spack find --paths c-lime | grep c-lime| cut -c 13-`
 GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
 MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
 echo clime X$CLIME
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@ -0,0 +1,206 @@
+The purpose of this file is to collate all non-obvious known magic shell variables
+and compiler flags required for either correctness or performance on various systems.
+
+A repository of work-arounds.
+
+Contents:
+1. Interconnect + MPI
+2. Compilation
+3. Profiling
+
+************************
+* 1. INTERCONNECT + MPI
+************************
+
+--------------------------------------------------------------------
+MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O 
+--------------------------------------------------------------------
+export OMPI_MCA_io=romio321
+
+--------------------------------------
+ROMIO fail with > 2GB per node read (32 bit issue)
+--------------------------------------
+
+Use later MPICH
+
+https://github.com/paboyle/Grid/issues/381
+
+https://github.com/pmodels/mpich/commit/3a479ab0
+
+--------------------------------------------------------------------
+Slingshot: Frontier and Perlmutter libfabric slow down 
+and physical memory fragmentation 
+--------------------------------------------------------------------
+export FI_MR_CACHE_MONITOR=disabled
+or
+export FI_MR_CACHE_MONITOR=kdreg2
+
+--------------------------------------------------------------------
+Perlmutter
+--------------------------------------------------------------------
+
+export MPICH_RDMA_ENABLED_CUDA=1
+export MPICH_GPU_IPC_ENABLED=1
+export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
+export MPICH_GPU_NO_ASYNC_MEMCPY=0
+
+--------------------------------------------------------------------
+Frontier/LumiG
+--------------------------------------------------------------------
+
+Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
+
+cat << EOF > select_gpu
+#!/bin/bash
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+export GPU_MAP=(0 1 2 3 7 6 5 4)
+export NUMA_MAP=(3 3 1 1 2 2 0 0)
+export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
+export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
+export HIP_VISIBLE_DEVICES=\$GPU
+unset ROCR_VISIBLE_DEVICES
+echo RANK \$SLURM_LOCALID using GPU \$GPU    
+exec numactl -m \$NUMA -N \$NUMA \$*
+EOF
+chmod +x ./select_gpu
+
+srun ./select_gpu BINARY
+
+
+--------------------------------------------------------------------
+Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
+--------------------------------------------------------------------
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+
+--------------------------------------------------------------------
+Mellanox + A100 correctness (Tursa, Booster, Leonardo)
+--------------------------------------------------------------------
+export UCX_MEMTYPE_CACHE=n
+
+--------------------------------------------------------------------
+MPICH/Aurora/PVC correctness and performance 
+--------------------------------------------------------------------
+
+https://github.com/pmodels/mpich/issues/7302
+
+--enable-cuda-aware-mpi=no  
+--enable-unified=no
+
+Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
+Do not use SVM
+
+Ideally use MPICH with fix to issue 7302:
+
+https://github.com/pmodels/mpich/pull/7312
+
+Ideally:
+MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
+
+Alternatives:
+export MPIR_CVAR_NOLOCAL=1
+export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
+
+--------------------------------------------------------------------
+MPICH/Aurora/PVC correctness and performance 
+--------------------------------------------------------------------
+
+Broken:
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+This gives good performance without requiring 
+--enable-cuda-aware-mpi=no  
+
+But is an open issue reported by James Osborn
+https://github.com/pmodels/mpich/issues/7139
+
+Possibly resolved but unclear if in the installed software yet.
+
+************************
+* 2. COMPILATION
+************************
+
+--------------------------------------------------------------------
+G++ compiler breakage / graveyard
+--------------------------------------------------------------------
+
+9.3.0, 10.3.1, 
+https://github.com/paboyle/Grid/issues/290
+https://github.com/paboyle/Grid/issues/264
+
+Working (-) Broken (X):
+
+4.9.0 -
+4.9.1 -
+5.1.0 X
+5.2.0 X
+5.3.0 X
+5.4.0 X
+6.1.0 X
+6.2.0 X
+6.3.0 -
+7.1.0 -
+8.0.0 (HEAD) -
+
+https://github.com/paboyle/Grid/issues/100
+
+--------------------------------------------------------------------
+AMD GPU nodes :
+--------------------------------------------------------------------
+
+multiple ROCM versions broken; use 5.3.0
+manifests itself as wrong results in fp32 
+
+https://github.com/paboyle/Grid/issues/464
+
+--------------------------------------------------------------------
+Aurora/PVC
+--------------------------------------------------------------------
+
+SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
+SYCL slow link and relocatable code issues (Christoph Lehner)
+Opt large register file required for good performance in fp64
+
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fPIC"
+
+--------------------------------------------------------------------
+Aurora/PVC useful extra options
+--------------------------------------------------------------------
+
+Host only sanitizer:
+-Xarch_host -fsanitize=leak
+-Xarch_host -fsanitize=address
+
+Deterministic MPI reduction:
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+
+
+************************
+* 3. Visual profile tools
+************************
+
+--------------------------------------------------------------------
+Frontier/rocprof
+--------------------------------------------------------------------
+
+--------------------------------------------------------------------
+Aurora/unitrace
+--------------------------------------------------------------------
+
+
+--------------------------------------------------------------------
+Tursa/nsight-sys
+--------------------------------------------------------------------
--- a/systems/sdcc-genoa/bench.slurm
+++ b/systems/sdcc-genoa/bench.slurm
@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --partition lqcd
+#SBATCH --time=00:50:00
+#SBATCH -A lqcdtest
+#SBATCH -q lqcd
+#SBATCH --exclusive
+#SBATCH --nodes=1
+#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=64
+#SBATCH --qos lqcd
+
+source sourceme.sh
+
+export PLACES=(1:16:4 1:32:2 0:64:1);
+export THR=(16 32 64)
+
+for t in  2 
+do
+   
+export OMP_NUM_THREADS=${THR[$t]}
+export OMP_PLACES=${PLACES[$t]}
+export thr=${THR[$t]}
+
+#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
+for vol in 48.48.48.96
+do
+srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
+done
+#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
+done
+
--- a/systems/sdcc-genoa/bench2.slurm
+++ b/systems/sdcc-genoa/bench2.slurm
@ -0,0 +1,36 @@
+#!/bin/bash
+#SBATCH --partition lqcd
+#SBATCH --time=00:50:00
+#SBATCH -A lqcdtest
+#SBATCH -q lqcd
+#SBATCH --exclusive
+#SBATCH --nodes=2
+#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
+#SBATCH --ntasks=2
+#SBATCH --cpus-per-task=64
+#SBATCH --qos lqcd
+
+source sourceme.sh
+
+export PLACES=(1:16:4 1:32:2 0:64:1);
+export THR=(16 32 64)
+
+nodes=2
+mpi=1.1.1.2
+
+for t in 2 
+do
+   
+export OMP_NUM_THREADS=${THR[$t]}
+export OMP_PLACES=${PLACES[$t]}
+export thr=${THR[$t]}
+
+#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
+
+for vol in 64.64.64.128
+do
+srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
+done
+
+done
+
--- a/systems/sdcc-genoa/config-command
+++ b/systems/sdcc-genoa/config-command
@ -0,0 +1,16 @@
+../../configure \
+--enable-comms=mpi-auto \
+--enable-unified=yes \
+--enable-shm=shmopen \
+--enable-shm-fast-path=shmopen \
+--enable-accelerator=none \
+--enable-simd=AVX512 \
+--disable-accelerator-cshift \
+--disable-fermion-reps \
+--disable-gparity \
+CXX=clang++ \
+MPICXX=mpicxx \
+CXXFLAGS="-std=c++17"
+
+
+
--- a/systems/sdcc-genoa/sourceme.sh
+++ b/systems/sdcc-genoa/sourceme.sh
@ -0,0 +1,4 @@
+source $HOME/spack/share/spack/setup-env.sh
+spack load llvm@17.0.4
+export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
+module load openmpi
--- a/systems/spack-linux/config-command
+++ b/systems/spack-linux/config-command
@ -0,0 +1,17 @@
+../../src/Grid/configure \
+    --prefix /home/pab/NPR/install \
+    --enable-comms=mpi-auto \
+    --enable-simd=AVX2 \
+    --enable-shm=none \
+    --enable-debug \
+    --with-lime=$CLIME \
+    --with-hdf5=$HDF5 \
+    --with-fftw=$FFTW \
+    --with-gmp=$GMP \
+    --with-mpfr=$MPFR \
+    --disable-gparity \
+    --disable-fermion-reps \
+    CXX=clang++ \
+    MPICXX=mpicxx \
+    CXXFLAGS="-std=c++17 "
+
--- a/systems/spack-linux/sourceme.sh
+++ b/systems/spack-linux/sourceme.sh
@ -0,0 +1,28 @@
+source $HOME/spack/share/spack/setup-env.sh
+spack load llvm@12
+spack load autoconf%clang@12.0.1
+spack load automake%clang@12.0.1
+spack load c-lime%clang@12.0.1
+spack load fftw%clang@12.0.1
+spack load gmp%clang@12.0.1
+spack load mpfr%clang@12.0.1
+spack load openmpi%clang@12.0.1
+spack load openssl%clang@12.0.1
+spack load hdf5+cxx%clang@12.0.1
+spack load cmake%clang@12.0.1
+export FFTW=`spack find --paths fftw%clang@12.0.1    | grep ^fftw   | awk '{print $2}' `
+export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1   | grep ^hdf5   | awk '{print $2}' `
+export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
+export MPFR=`spack find --paths mpfr%clang@12.0.1    | grep ^mpfr  | awk '{print $2}' `
+export LLVM=`spack find --paths llvm@12    | grep ^llvm  | awk '{print $2}' `
+export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk  '{print $2}' `
+export GMP=`spack find --paths gmp%clang@12.0.1      | grep ^gmp | awk '{print $2}' `
+export TCLAP=`spack find --paths tclap%clang@12.0.1  | grep ^tclap | awk '{print $2}' `
+export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH
+
+ulimit -s 81920
--- a/systems/spack-linux/spack-install
+++ b/systems/spack-linux/spack-install
@ -0,0 +1,19 @@
+cd
+git clone https://github.com/spack/spack.git
+source $HOME/spack/share/spack/setup-env.sh
+
+spack install llvm@12
+
+spack install autoconf%clang@12.0.1
+spack install automake%clang@12.0.1
+spack install c-lime%clang@12.0.1
+spack install fftw%clang@12.0.1
+spack install gmp%clang@12.0.1
+spack install mpfr%clang@12.0.1
+spack install openmpi%clang@12.0.1
+spack install openssl%clang@12.0.1
+spack install hdf5+cxx%clang@12.0.1
+spack install cmake%clang@12.0.1
+spack install tclap%clang@12.0.1
+spack install emacs%clang@12.0.1
+
--- a/tests/Test_dwf_dslash_repro.cc
+++ b/tests/Test_dwf_dslash_repro.cc
@ -0,0 +1,239 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_dslash_repro.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+#endif
+
+typedef LatticeFermionD FermionField;
+
+// Compare two fermion fields word by word inside an accelerator_for loop.
+//
+//   res : field under test. It is opened with an AcceleratorWrite view so that the
+//         optional error injection below can overwrite one site.
+//   ref : reference field, only read here.
+//
+// Returns 0 when every scalar word of res equals the matching word of ref and 1 if any
+// word differs. The comparison is exact (operator!=); no tolerance is applied.
+int VerifyOnDevice(const FermionField &res, FermionField &ref)
+{
+  // One-int flag held in a deviceVector; any lane that sees a mismatch sets it to 1
+  deviceVector<int> Fails(1);
+  int * Fail = &Fails[0];
+  int FailHost=0;
+  
+  typedef typename FermionField::vector_object vobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  
+  const uint64_t NN = res.Grid()->oSites();
+
+  // Clear the flag before the comparison runs
+  acceleratorPut(*Fail,FailHost);
+
+  accelerator_barrier();
+
+  // Optional error injection, enabled when GRID_ERROR_INJECT is set in the environment
+  int injection=0;
+  if(getenv("GRID_ERROR_INJECT")) injection=1;
+  autoView(res_v,res,AcceleratorWrite);
+  autoView(ref_v,ref,AcceleratorRead);
+  if ( res.Grid()->ThisRank()== 0 )
+  {
+    // Rank 0 only, and only on calls where the low four bits of random() are zero
+    if (((random()&0xF)==0)&&injection) {
+      uint64_t sF = random()%(NN);
+      // sF is unsigned 64-bit: print it with an unsigned conversion (was %ld)
+      printf("Error injection site %lu on rank %d\n",(unsigned long)sF,res.Grid()->ThisRank());
+      // Overwrite the first double of the chosen site with M_PI
+      auto vv = acceleratorGet(res_v[sF]);
+      double *dd = (double *)&vv;
+      *dd=M_PI;
+      acceleratorPut(res_v[sF],vv);
+    }
+  }
+
+  accelerator_for( sF, NN, vobj::Nsimd(), {
+#ifdef GRID_SIMT
+      {
+        int blane = acceleratorSIMTlane(vobj::Nsimd());
+#else
+      // blane must start at 0: it was previously declared without an initialiser, so the
+      // lane loop began from an indeterminate value (undefined behaviour).
+      for(int blane=0;blane<vobj::Nsimd();blane++){
+#endif
+	// View each site object as an array of vector_type words
+	vector_type *vtrr = (vector_type *)&res_v[sF];
+	vector_type *vtrf = (vector_type *)&ref_v[sF];
+	int words = sizeof(vobj)/sizeof(vector_type);
+	
+	for(int w=0;w<words;w++){
+	  scalar_type rrtmp = getlane(vtrr[w], blane);
+	  scalar_type rftmp = getlane(vtrf[w], blane);
+	  if ( rrtmp != rftmp) {
+	      *Fail=1;
+	  }
+	}
+      }
+  });
+
+  // Read the flag back to the host
+  FailHost = acceleratorGet(*Fail);
+
+  return FailHost;
+}
+// Host-side diagnostic: redo the res/ref comparison on the CPU and print one line per
+// differing spin-colour component.
+//
+//   res, ref : the fields to compare (both opened read-only on the host).
+//   ids      : buffer of oSites()*Nsimd uint64_t words, one per (site, lane), passed to
+//              acceleratorCopyFromDevice as the source. The low three bytes of each word
+//              are printed as "subslice", "slice" and "EU".
+void PrintFails(const FermionField &res, FermionField &ref,uint64_t *ids)
+{
+  typedef typename FermionField::vector_object vobj;
+
+  const int Nsimd=vobj::Nsimd();
+  const uint64_t NN = res.Grid()->oSites();
+
+  // Host views of both fields
+  autoView(res_v,res,CpuRead);
+  autoView(ref_v,ref,CpuRead);
+
+  // Pull the id words back into a host vector
+  std::vector<uint64_t> host_ids(NN*Nsimd);
+  acceleratorCopyFromDevice(ids,host_ids.data(),NN*Nsimd*sizeof(uint64_t));
+
+  // Walk every site and SIMD lane, then the 4 x 3 components of each extracted lane
+  for(int site=0;site< NN; site++){
+    for(int lane=0;lane<Nsimd;lane++){
+      auto got  = extractLane(lane,res_v[site]);
+      auto want = extractLane(lane,ref_v[site]);
+
+      // Unpack the three byte-wide fields of this (site, lane) id word
+      uint64_t id = host_ids[lane+Nsimd*site];
+      int subslice=(id>>0 )&0xFF;
+      int slice   =(id>>8 )&0xFF;
+      int eu      =(id>>16)&0xFF;
+
+      for(int s=0;s<4;s++){
+        for(int c=0;c<3;c++){
+          if ( got()(s)(c)!=want()(s)(c) ) {
+            std::cout << GridHostname()<<" miscompare site "<<site<<" "<<got()(s)(c)<<" "<<want()(s)(c)<<" EU "<<eu<<" slice "<<slice<<" subslice "<<subslice<<std::endl;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+
+// Reproducibility stress test for the domain-wall operator.
+// A reference result is computed once with DhopComms + DhopCalc; the loop then repeats
+// DhopCalc on the same source and compares each result with the reference using
+// VerifyOnDevice, printing details through PrintFails when they differ.
+// The loop runs until the time limit given by --seconds (default 600) has elapsed.
+int main (int argc, char ** argv)
+{
+  // NOTE(review): hostname/host are filled in here but not referenced again in main
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::string host(hostname);
+  
+  Grid_init(&argc,&argv);
+
+  const int Ls=12;
+
+  // Four-dimensional grid, its red-black counterpart, and the five-dimensional grids built from it with extent Ls
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Fixed seeds for both random number generators
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  // src is the operator input; junk is random data used to overwrite result before every iteration
+  LatticeGaugeField Umu(UGrid);
+  LatticeFermionD    src(FGrid); random(RNG5,src);
+  LatticeFermionD   junk(FGrid); random(RNG5,junk);
+
+  LatticeFermionD result(FGrid); result=Zero();
+  LatticeFermionD ref(FGrid); ref=Zero();
+  
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+
+  RealD mass=0.1;
+  RealD M5=1.8;
+
+  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  // Run time in seconds: 600 unless --seconds is given on the command line
+  int nsecs=600;
+  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
+    std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
+    GridCmdOptionInt(arg,nsecs);
+  }
+  
+  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
+  UGrid->Barrier();
+  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
+
+  std::cout << GridLogMessage << "::::::::::::: Starting DWF repro for "<<nsecs <<" seconds" << std::endl;
+
+  // The start time is broadcast (first argument 0, presumably the root rank - confirm) so that
+  // the loop-exit test below compares broadcast values rather than each rank's own clock
+  time_t now;
+  time_t start = time(NULL);
+  UGrid->Broadcast(0,(void *)&start,sizeof(start));
+
+  // FlightRecorder defaults; three of them can be overridden from the environment below
+  FlightRecorder::ContinueOnFail = 0;
+  FlightRecorder::PrintEntireLog = 0;
+  FlightRecorder::ChecksumComms  = 0;
+  FlightRecorder::ChecksumCommsSend=0;
+
+  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms      = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend  = atoi(s);
+
+  // One uint64_t per (outer site, SIMD lane) of the five-dimensional grid
+  const uint64_t NN = FGrid->oSites()*vComplexD::Nsimd();
+  
+  deviceVector<uint64_t> ids_device(NN);
+  uint64_t *ids = &ids_device[0];
+  
+
+  // Reference result, computed once
+  Ddwf.DhopComms(src,ref);
+  Ddwf.DhopCalc(src,ref,ids);
+
+  // DhopComms is called once for result here; the loop below only repeats DhopCalc
+  Ddwf.DhopComms(src,result);
+  
+  int iter=0;
+  do {
+    
+    // Refill result with the random field before each recomputation
+    // (presumably so a stale correct value cannot mask a failed DhopCalc - confirm)
+    result=junk;
+
+    Ddwf.DhopCalc(src,result,ids);
+
+    if ( VerifyOnDevice(result, ref) ) {
+      printf("Node %s Iter %d detected fails\n",GridHostname(),iter);
+      PrintFails(result,ref,ids);
+      //      std::cout << " Dslash "<<iter<<" is WRONG! "<<std::endl;
+    }
+    //else {
+    //      printf("Node %s Iter %d detected NO fails\n",GridHostname(),iter);
+    //      PrintFails(result,ref,ids);
+    //      std::cout << " Dslash "<<iter<<" is OK! "<<std::endl;
+    //}
+
+
+    iter ++;
+    // The current time is broadcast in the same way as start before the exit test
+    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
+  } while (now < (start + nsecs) );
+
+  
+  Grid_finalize();
+}
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@ -124,6 +124,8 @@ int main (int argc, char ** argv)

  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
+  //  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
+  //  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);

  int nsecs=600;
  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
@ -131,6 +133,10 @@ int main (int argc, char ** argv)
    GridCmdOptionInt(arg,nsecs);
  }
  
+  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
+  UGrid->Barrier();
+  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
+
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;

  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
@ -148,7 +154,7 @@ int main (int argc, char ** argv)

  FlightRecorder::ContinueOnFail = 0;
  FlightRecorder::PrintEntireLog = 0;
-  FlightRecorder::ChecksumComms  = 1;
+  FlightRecorder::ChecksumComms  = 0;
  FlightRecorder::ChecksumCommsSend=0;

  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
@ -180,7 +186,7 @@ int main (int argc, char ** argv)
    iter ++;
    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
  } while (now < (start + nsecs/10) );
-    
+
  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
  int i=0;
--- a/tests/Test_meson_field.cc
+++ b/tests/Test_meson_field.cc
@ -31,7 +31,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 using namespace Grid;

 const int TSRC = 0;  //timeslice where rho is nonzero
-const int VDIM = 5; //length of each vector
+const int VDIM = 8; //length of each vector

 typedef typename DomainWallFermionD::ComplexField ComplexField;
 typedef typename DomainWallFermionD::FermionField FermionField;
@ -55,19 +55,26 @@ int main(int argc, char *argv[])
  pRNG.SeedFixedIntegers(seeds);

  // MesonField lhs and rhs vectors
+  const int Nem=1;
  std::vector<FermionField> phi(VDIM,&grid);
-  std::vector<FermionField> rho(VDIM,&grid);
-  FermionField rho_tmp(&grid);
+  std::vector<ComplexField> B0(Nem,&grid);
+  std::vector<ComplexField> B1(Nem,&grid);
  std::cout << GridLogMessage << "Initialising random meson fields" << std::endl;
  for (unsigned int i = 0; i < VDIM; ++i){
    random(pRNG,phi[i]);
-    random(pRNG,rho_tmp); //ideally only nonzero on t=0
-    rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0
+  }
+  for (unsigned int i = 0; i < Nem; ++i){
+    random(pRNG,B0[i]);
+    random(pRNG,B1[i]);
  }
  std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl;

  // Gamma matrices used in the contraction
  std::vector<Gamma::Algebra> Gmu = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT,
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
@ -78,11 +85,15 @@ int main(int argc, char *argv[])
  std::vector<std::vector<double>> momenta = {
 	  {0.,0.,0.},
 	  {1.,0.,0.},
+	  {-1.,0.,0.},
+	  {0,1.,0.},
+	  {0,-1.,0.},
+	  {0,0,1.},
+	  {0,0,-1.},
 	  {1.,1.,0.},
 	  {1.,1.,1.},
 	  {2.,0.,0.}
  };
-
  std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl;

  std::cout << GridLogMessage << "Computing complex phases" << std::endl;
@ -102,28 +113,29 @@ int main(int argc, char *argv[])
  std::cout << GridLogMessage << "Computing complex phases done." << std::endl;

  Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
-  Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
-  Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
+  Eigen::Tensor<ComplexD,5, Eigen::RowMajor> App(B0.size(),1,Nt,VDIM,VDIM);

  // timer
  double start,stop;

+  /////////////////////////////////////////////////////////////////////////
  //execute meson field routine
+  /////////////////////////////////////////////////////////////////////////
+  A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
  start = usecond();
  A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
  stop = usecond();
  std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl;
-  start = usecond();
-  /* Ideally, for this meson field we could pass TSRC (even better a list of timeslices)
-   * to the routine so that all the compnents which are predictably equal to zero are not computed. */
-  A2Autils<WilsonImplR>::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp);
-  stop = usecond();
-  std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl;
-  start = usecond();
-  A2Autils<WilsonImplR>::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp);
-  stop = usecond();
-  std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl;

+  /////////////////////////////////////////////////////////////////////////
+  //execute aslash field routine
+  /////////////////////////////////////////////////////////////////////////
+  A2Autils<WilsonImplR>::AslashField(App,&phi[0],&phi[0],B0,B1,Tp);
+  start = usecond();
+  A2Autils<WilsonImplR>::AslashField(App,&phi[0],&phi[0],B0,B1,Tp);
+  stop = usecond();
+  std::cout << GridLogMessage << "Alash(phi,phi) created, execution time " << stop-start << " us" << std::endl;
+  
  std::string FileName = "Meson_Fields";
 #ifdef HAVE_HDF5
  using Default_Reader = Grid::Hdf5Reader;
@ -134,12 +146,11 @@ int main(int argc, char *argv[])
  using Default_Writer = Grid::BinaryWriter;
  FileName.append(".bin");
 #endif
-
-  Default_Writer w(FileName);
-  write(w,"phi_phi",Mpp);
-  write(w,"phi_rho",Mpr);
-  write(w,"rho_rho",Mrr);
-
+  {
+    Default_Writer w(FileName);
+    write(w,"MesonField",Mpp);
+    write(w,"AslashField",App);
+  }
  // epilogue
  std::cout << GridLogMessage << "Grid is finalizing now" << std::endl;
  Grid_finalize();
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@ -39,7 +39,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  Coordinate latt_size   = GridDefaultLatt();
-  Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1});
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();

  int vol = 1;
@ -279,6 +279,7 @@ int main (int argc, char ** argv)
    
    result5 = result5 - Kinetic;
    std::cout<<"diff "<< norm2(result5)<<std::endl;
+    assert(norm2(result5)<1.0e-4);
    
  }

@ -357,6 +358,7 @@ int main (int argc, char ** argv)
    
    diff = ref - result4;
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
+    assert(norm2(diff)<1.0e-4);

  }

@ -440,6 +442,7 @@ int main (int argc, char ** argv)
    
    diff = ref - result4;
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
+    assert(norm2(diff)<1.0e-4);

  }

--- a/tests/core/Test_fft_pf.cc
+++ b/tests/core/Test_fft_pf.cc
@ -38,7 +38,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  Coordinate latt_size   = GridDefaultLatt();
-  Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1});
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();

  int vol = 1;
@ -74,7 +74,7 @@ int main (int argc, char ** argv)
  
  {
    std::cout<<"****************************************"<<std::endl;
-    std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator \n";
+    std::cout << "Testing OverlapWilsonPartialFractionTanhFermionD Hw kernel Mom space 4d propagator \n";
    std::cout<<"****************************************"<<std::endl;

    //    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
@ -88,7 +88,7 @@ int main (int argc, char ** argv)

    RealD mass=0.1;
    RealD M5  =0.8;
-    OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
+    OverlapWilsonPartialFractionTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);

    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
@ -119,9 +119,10 @@ int main (int argc, char ** argv)
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dov.Mdag(src5,tmp5);
    src5=tmp5;
-    MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
+    MdagMLinearOperator<OverlapWilsonPartialFractionTanhFermionD,LatticeFermionD> HermOp(Dov);
    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
    CG(HermOp,src5,result5);
+    std::cout << " Solved by Conjugate Gradient (CGNE)" <<std::endl;
    ////////////////////////////////////////////////////////////////////////
    // Domain wall physical field propagator
    ////////////////////////////////////////////////////////////////////////
@ -153,7 +154,7 @@ int main (int argc, char ** argv)
  ////////////////////////////////////////////////////
  {
    std::cout<<"****************************************"<<std::endl;
-    std::cout << "Testing Dov(Hw) Mom space 4d propagator \n";
+    std::cout << "Testing OverlapWilsonCayleyTanhFermionD space 4d propagator \n";
    std::cout<<"****************************************"<<std::endl;

    LatticeFermionD    tmp(&GRID);
@ -228,7 +229,7 @@ int main (int argc, char ** argv)
  
  {
    std::cout<<"****************************************"<<std::endl;
-    std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator with q\n";
+    std::cout<<"Testing OverlapWilsonPartialFractionTanhFermionD Hw kernel Mom space 4d propagator with q\n";
    std::cout<<"****************************************"<<std::endl;

    //    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
@ -242,7 +243,9 @@ int main (int argc, char ** argv)

    RealD mass=0.1;
    RealD M5  =0.8;
-    OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
+    OverlapWilsonPartialFractionTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
+    std::vector<RealD> qmu({1.0,0.0,0.0,0.0});
+    Dov.set_qmu(qmu);

    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
@ -273,7 +276,7 @@ int main (int argc, char ** argv)
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dov.Mdag(src5,tmp5);
    src5=tmp5;
-    MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
+    MdagMLinearOperator<OverlapWilsonPartialFractionTanhFermionD,LatticeFermionD> HermOp(Dov);
    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
    CG(HermOp,src5,result5);
    ////////////////////////////////////////////////////////////////////////
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@ -39,7 +39,8 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  Coordinate latt_size   = GridDefaultLatt();
-  Coordinate simd_layout( { vComplexF::Nsimd(),1,1,1});
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  //  Coordinate simd_layout( { vComplexF::Nsimd(),1,1,1});
  Coordinate mpi_layout  = GridDefaultMpi();

  int vol = 1;
--- a/tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
@ -0,0 +1,781 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
+#include <Grid/algorithms/iterative/AdefMrhs.h>
+#include <Grid/algorithms/iterative/PowerSpectrum.h>
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+
+using namespace std;
+using namespace Grid;
+
+/// Write every field held in an indexable container to one SciDAC file.
+///
+/// The body is compiled only when HAVE_LIME is defined; otherwise the call is a no-op.
+/// @param Agg   container of lattice fields; must provide size() and operator[].
+/// @param file  file name passed to ScidacWriter::open.
+template<class aggregation>
+void SaveFineEvecs(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  // The grid is taken from element 0, so an empty container is a caller error.
+  assert(Agg.size()>0);
+  emptyUserRecord record;
+  ScidacWriter WR(Agg[0].Grid()->IsBoss());
+  WR.open(file);
+  // Index with the container's own size type to avoid a signed/unsigned comparison.
+  for(decltype(Agg.size()) b=0;b<Agg.size();b++){
+    WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+  }
+  WR.close();
+#endif
+}
+/// Dump the subspace vectors of an aggregation object into a single SciDAC file.
+///
+/// Active only when HAVE_LIME is defined; otherwise the function body is empty.
+/// @param Agg   aggregation object exposing FineGrid and subspace.
+/// @param file  file name passed to ScidacWriter::open.
+template<class aggregation>
+void SaveBasis(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacWriter writer(Agg.FineGrid->IsBoss());
+  writer.open(file);
+  // One record per basis vector, written with the lexicographic BinaryIO control flag.
+  int idx=0;
+  while(idx<Agg.subspace.size()){
+    writer.writeScidacFieldRecord(Agg.subspace[idx],userRecord,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+    ++idx;
+  }
+  writer.close();
+#endif
+}
+/// Fill the subspace vectors of an aggregation object from a SciDAC file.
+///
+/// Reads exactly Agg.subspace.size() records, in order. Active only when HAVE_LIME
+/// is defined; otherwise the function body is empty.
+/// @param Agg   aggregation object whose subspace entries are overwritten.
+/// @param file  file name passed to ScidacReader::open.
+template<class aggregation>
+void LoadBasis(aggregation &Agg, std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacReader reader ;
+  reader.open(file);
+  int idx=0;
+  while(idx<Agg.subspace.size()){
+    reader.readScidacFieldRecord(Agg.subspace[idx],userRecord,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+    ++idx;
+  }
+  reader.close();
+#endif
+}
+
+/// Load a subspace from a file that stores N records per basis vector, keeping
+/// only the first record of each group.
+///
+/// Every record is read into tmp; the first of each group of N is converted with
+/// precisionChange into the corresponding subspace entry. Active only when
+/// HAVE_LIME is defined.
+/// @param Agg   aggregation object whose subspace entries are overwritten.
+/// @param file  file name passed to ScidacReader::open.
+/// @param N     number of consecutive records belonging to one basis vector.
+/// @param tmp   scratch field each record is read into.
+template<class aggregation>
+void LoadBasisSkip(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacReader reader ;
+
+  reader.open(file);
+  for(int vec=0;vec<Agg.subspace.size();++vec){
+    int rec=0;
+    while(rec<N){
+      reader.readScidacFieldRecord(tmp,userRecord,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+      // Only the leading record of the group is kept; the rest are read and dropped.
+      if(0==rec) {
+	precisionChange(Agg.subspace[vec],tmp);
+      }
+      ++rec;
+    }
+  }
+  reader.close();
+#endif
+}
+/// Load a subspace from a file that stores N records per basis vector, summing
+/// each group of N records into one basis vector.
+///
+/// Records are read into tmp and accumulated in a field on tmp's grid; the sum is
+/// converted with precisionChange into the subspace entry. Active only when
+/// HAVE_LIME is defined.
+/// @param Agg   aggregation object whose subspace entries are overwritten.
+/// @param file  file name passed to ScidacReader::open.
+/// @param N     number of consecutive records summed per basis vector.
+/// @param tmp   scratch field each record is read into.
+template<class aggregation>
+void LoadBasisSum(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacReader reader ;
+
+  LatticeFermionF accum(tmp.Grid());
+  reader.open(file);
+  for(int vec=0;vec<Agg.subspace.size();++vec){
+    // Restart the accumulator for every basis vector.
+    accum=Zero();
+    int rec=0;
+    while(rec<N){
+      reader.readScidacFieldRecord(tmp,userRecord,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+      accum=accum+tmp;
+      ++rec;
+    }
+    precisionChange(Agg.subspace[vec],accum);
+  }
+  reader.close();
+#endif
+}
+
+/// Write eigenvectors to a SciDAC file and the matching eigenvalues to an XML file.
+///
+/// Active only when HAVE_LIME is defined; otherwise the function body is empty.
+/// @param eval       eigenvalues, written under the XML tag "evals".
+/// @param evec       eigenvectors, one SciDAC record each; must not be empty.
+/// @param evec_file  file name passed to ScidacWriter::open.
+/// @param eval_file  file name passed to the XmlWriter constructor.
+template<class CoarseVector>
+void SaveEigenvectors(std::vector<RealD>            &eval,
+		      std::vector<CoarseVector>     &evec,
+		      std::string evec_file,
+		      std::string eval_file)
+{
+#ifdef HAVE_LIME
+  // The grid is taken from evec[0], so an empty vector is a caller error.
+  assert(evec.size()>0);
+  emptyUserRecord record;
+  ScidacWriter WR(evec[0].Grid()->IsBoss());
+  WR.open(evec_file);
+  // size_t index avoids a signed/unsigned comparison against evec.size().
+  for(size_t b=0;b<evec.size();b++){
+    WR.writeScidacFieldRecord(evec[b],record,0,0);
+  }
+  WR.close();
+  XmlWriter WRx(eval_file);
+  write(WRx,"evals",eval);
+#endif
+}
+/// Read eigenvalues from an XML file and the matching eigenvectors from a SciDAC file.
+///
+/// The eigenvalues are read first (XML tag "evals"); evec must already be sized to
+/// the same length, which is asserted before any record is read. Active only when
+/// HAVE_LIME is defined.
+/// @param eval       receives the eigenvalues.
+/// @param evec       pre-sized eigenvector storage, overwritten record by record.
+/// @param evec_file  file name passed to ScidacReader::open.
+/// @param eval_file  file name passed to the XmlReader constructor.
+template<class CoarseVector>
+void LoadEigenvectors(std::vector<RealD>            &eval,
+		      std::vector<CoarseVector>     &evec,
+		      std::string evec_file,
+		      std::string eval_file)
+{
+#ifdef HAVE_LIME
+  XmlReader xmlIn(eval_file);
+  read(xmlIn,"evals",eval);
+  emptyUserRecord userRecord;
+
+  Grid::ScidacReader reader ;
+  reader.open(evec_file);
+  assert(evec.size()==eval.size());
+  int k=0;
+  while(k<eval.size()){
+    reader.readScidacFieldRecord(evec[k],userRecord);
+    ++k;
+  }
+  reader.close();
+#endif
+}
+
+// Adaptor so that a consumer calling Op/AdjOp ends up in the wrapped operator's HermOp
+// (original intent: "Want Op in CoarsenOp to call MatPcDagMatPc").
+// Op, AdjOp and HermOp all forward to wrapped.HermOp; every other entry point asserts.
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & wrapped;
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme)
+    : wrapped(wrapme)
+  {
+  }
+
+  // Forwarding entry points
+  void Op(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+  void HermOp(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+  void AdjOp(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+
+  // Unsupported entry points
+  void OpDiag(const Field &in, Field &out)
+  {
+    assert(0);
+  }
+  void OpDir(const Field &in, Field &out,int dir,int disp)
+  {
+    assert(0);
+  }
+  void OpDirAll(const Field &in, std::vector<Field> &out)
+  {
+    assert(0);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
+  {
+    assert(0);
+  }
+};
+
+// LinearFunction wrapping a ConjugateGradientPolynomial of fixed iteration count.
+//
+// Single-vector call: forms the residual r0 = in - HermOp(out), obtains a correction
+// either from CG.Solve ("recording") or from CG.CGsequenceHermOp ("replaying"), adds
+// it to out and prints the resulting relative residual.
+// Multi-vector call: zeroes out and runs a BlockConjugateGradient (BlockCGrQVec)
+// with tolerance 0.0 and iters as the iteration limit.
+template<class Field> class FixedCGPolynomial : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator   & _SmootherOperator;
+  ConjugateGradientPolynomial<Field>  CG;
+  int iters;
+  bool record;      // true: next single-vector call records via CG.Solve; false: replay
+  int replay_count; // number of replay calls made so far
+  // Initialisers are listed in member declaration order (the order in which they
+  // actually run); CG only uses the constructor argument, not the iters member.
+  FixedCGPolynomial(int _iters, FineOperator &SmootherOperator) :
+    _SmootherOperator(SmootherOperator),
+    CG(0.0,_iters,false),
+    iters(_iters),
+    record(true)
+  {
+    std::cout << GridLogMessage<<" FixedCGPolynomial order "<<iters<<std::endl;
+    replay_count = 0;
+  };
+  // in: right-hand side; out: current guess on entry, improved guess on return.
+  void operator() (const Field &in, Field &out) 
+  {
+#if 1
+    GridBase *grid = in.Grid();
+    Field Mx0(grid);
+    Field r0(grid);
+    Field Minvr0(grid);
+
+    _SmootherOperator.HermOp(out,Mx0);
+
+    // Residual of the incoming guess
+    r0 = in - Mx0;
+
+    Minvr0 = Zero();
+    Minvr0.Checkerboard()=in.Checkerboard();
+    
+    if ( record ) {
+      std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
+      CG.Solve(_SmootherOperator,r0,Minvr0);
+      record = false;
+      /*
+      std::cout << "P(x) = 0 "<<std::endl;
+      for(int i=0;i<CG.polynomial.size();i++){
+	std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
+	}
+      */
+      // Cross-check: replay on the same residual and report the difference to the solve
+      Field tmp(Minvr0.Grid());
+      CG.CGsequenceHermOp(_SmootherOperator,r0,tmp);
+      tmp = tmp - Minvr0;
+      std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
+    } else {
+      std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
+      CG.CGsequenceHermOp(_SmootherOperator,r0,Minvr0);
+      // Request a fresh recording whenever the replay counter is a multiple of 5
+      // (this includes the very first replay, when the counter is still 0).
+      if ( replay_count %5== 0 ) record=true;
+      replay_count++;
+    }
+    out = out + Minvr0;
+    // Report |HermOp(out) - in| / |in| for the updated guess
+    _SmootherOperator.HermOp(out,r0);
+    r0 = r0 - in;
+    RealD rr=norm2(r0);
+    RealD ss=norm2(in);
+    std::cout << " FixedCGPolynomial replayed polynomial resid "<<::sqrt(rr/ss)<<std::endl;
+#else
+    // Disabled variant: ignores the incoming guess and solves directly on in.
+    out = Zero();
+    out.Checkerboard()=in.Checkerboard();
+    if ( record ) {
+      std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
+      CG.Solve(_SmootherOperator,in,out);
+      record = false;
+      std::cout << "P(x) = 0 "<<std::endl;
+      for(int i=0;i<CG.polynomial.size();i++){
+	std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
+      }
+      Field tmp(in.Grid());
+      CG.CGsequenceHermOp(_SmootherOperator,in,tmp);
+      tmp = tmp - out;
+      std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
+    } else {
+      std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
+      CG.CGsequenceHermOp(_SmootherOperator,in,out);
+      // NOTE(review): x%5 can never equal 5, so this variant never re-records;
+      // confirm whether that is intended or a typo for == 0 as in the branch above.
+      if ( replay_count %5== 5 ) record=true;
+      replay_count++;
+    }
+#endif
+    
+  }
+  // Multi right-hand-side version: starts from a zero guess for every vector.
+  void operator() (const std::vector<Field> &in, std::vector<Field> &out)
+  {
+    for(int i=0;i<out.size();i++){
+      out[i]=Zero();
+    }
+    int blockDim = 0;//not used for BlockCGVec
+    BlockConjugateGradient<Field>    BCGV  (BlockCGrQVec,blockDim,0.0,iters,false);
+    BCGV(_SmootherOperator,in,out);
+  }
+  
+};
+// Smoother that runs a plain ConjugateGradient from a zero guess for at most
+// `iters` iterations with tolerance 0.0; the third constructor argument is false
+// because failing to converge is acceptable in a smoother.
+template<class Field> class CGSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+
+  FineOperator   & _SmootherOperator;
+  int iters;
+
+  CGSmoother(int _iters, FineOperator &SmootherOperator)
+    : _SmootherOperator(SmootherOperator), iters(_iters)
+  {
+    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
+  }
+
+  // in: right-hand side; out: overwritten with the smoothed result.
+  void operator() (const Field &in, Field &out)
+  {
+    out=Zero();
+    ConjugateGradient<Field>  smootherCG(0.0,iters,false);
+    smootherCG(_SmootherOperator,in,out);
+  }
+};
+
+
+// Target function 1/x handed to the Chebyshev constructors in this file.
+RealD InverseApproximation(RealD x)
+{
+  const RealD reciprocal = 1.0/x;
+  return reciprocal;
+}
+// Smoother that applies a Chebyshev object built from InverseApproximation on
+// [_lo,_hi] at order _ord to the input vector.
+template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+
+  FineOperator   & _SmootherOperator;
+  Chebyshev<Field> Cheby;
+
+  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator)
+    : _SmootherOperator(SmootherOperator), Cheby(_lo,_hi,_ord,InverseApproximation)
+  {
+    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
+  }
+
+  // in: right-hand side; out: result of Cheby applied with the smoother operator.
+  // No residual is computed or printed here.
+  void operator() (const Field &in, Field &out)
+  {
+    Cheby(_SmootherOperator,in,out);
+  }
+};
+
+
+// Approximate inverter: corrects the incoming guess with a Chebyshev object built
+// from InverseApproximation on [_lo,_hi] at order _ord, applied to the residual,
+// then prints the relative residual of the updated guess.
+template<class Field> class ChebyshevInverter : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+
+  FineOperator   & _Operator;
+  Chebyshev<Field> Cheby;
+
+  ChebyshevInverter(RealD _lo,RealD _hi,int _ord, FineOperator &Operator)
+    : _Operator(Operator), Cheby(_lo,_hi,_ord,InverseApproximation)
+  {
+    std::cout << GridLogMessage<<" Chebyshev Inverter order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
+  }
+
+  // in: right-hand side b; out: guess x on entry, x + Cheby(b - A x) on return.
+  void operator() (const Field &in, Field &out)
+  {
+    Field resid(in.Grid());
+    Field correction(in.Grid());
+
+    // resid = b - A x for the incoming guess
+    _Operator.HermOp(out,resid);
+    resid = in - resid;
+
+    // correction ~ A^{-1} ( b - A x ) ~ A^{-1} b - x
+    Cheby(_Operator,resid,correction);
+    out = out + correction;
+
+    // resid = b - A x for the updated guess; report its norm relative to b
+    _Operator.HermOp(out,resid);
+    resid = in - resid;
+    RealD residNorm2 = norm2(resid);
+    RealD srcNorm2   = norm2(in);
+    std::cout << "ChebshevInverse resid " <<::sqrt(residNorm2/srcNorm2)<<std::endl;
+  }
+};
+
+
+
+// Test driver. In order, the active code:
+//   1. parses --sample and --mass, builds double and single precision 4d/5d grids;
+//   2. reads a gauge configuration and builds a Mobius operator with its
+//      SchurDiagMooee Hermitian operator, then runs a power method on it;
+//   3. builds a coarse grid, loads (or creates and saves) the aggregation subspace;
+//   4. coarsens the operator, runs a block Lanczos on the coarse operator and imports
+//      the eigenbasis into the multi-RHS deflation guesser;
+//   5. calls the multi-RHS two-level ADEF2 solver on nrhs random sources.
+// Sections wrapped in if(0) / #if 0 are kept for reference and do not run.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  // NOTE(review): `sample` is parsed here but only referenced in commented-out code below.
+  int sample=1;
+  if( GridCmdOptionExists(argv,argv+argc,"--sample") ){
+    std::string arg;
+    arg = GridCmdOptionPayload(argv,argv+argc,"--sample");
+    GridCmdOptionInt(arg,sample);
+  }
+  
+  const int Ls=24;
+  const int nbasis = 62;
+  const int cb = 0 ;
+  RealD mass=0.00078;
+
+  // Optional command-line override of the default mass
+  if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
+    std::string arg;
+    arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
+    GridCmdOptionFloat(arg,mass);
+  }
+
+  RealD M5=1.8;
+  RealD b=1.5;
+  RealD c=0.5;
+
+  std::cout << GridLogMessage << " *************************** " <<std::endl;
+  std::cout << GridLogMessage << " Mass " <<mass<<std::endl;
+  std::cout << GridLogMessage << " M5   " <<M5<<std::endl;
+  std::cout << GridLogMessage << " Ls   " <<Ls<<std::endl;
+  std::cout << GridLogMessage << " b    " <<b<<std::endl;
+  std::cout << GridLogMessage << " c    " <<c<<std::endl;
+  std::cout << GridLogMessage << " *************************** " <<std::endl;
+  
+  //////////////////////////////////////////
+  // Default (vComplex) precision grids
+  //////////////////////////////////////////
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  //////////////////////////////////////////
+  // Single precision grids -- lanczos + smoother
+  //////////////////////////////////////////
+  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplexF::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+  ///////////////////////// Configuration /////////////////////////////////
+  LatticeGaugeField Umu(UGrid);
+
+  // The configuration file name is hard-coded and resolved relative to the working directory.
+  FieldMetaData header;
+  std::string file("ckpoint_lat.1000");
+  NerscIO::readConfiguration(Umu,header,file);
+
+  //////////////////////// Fermion action //////////////////////////////////
+  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
+  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << "         Fine Power method            "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+
+  {
+    LatticeFermionD pm_src(FrbGrid);
+    pm_src = ComplexD(1.0);
+    PowerMethod<LatticeFermionD>       fPM;
+    fPM(HermOpEO,pm_src);
+  }
+
+  // Disabled: single precision fine-grid Lanczos that would save its eigenvectors to disk.
+  if(0)
+  {
+
+    std::cout << "**************************************"<<std::endl;
+    std::cout << "         Fine Lanczos           "<<std::endl;
+    std::cout << "**************************************"<<std::endl;
+
+    typedef LatticeFermionF FermionField;
+    LatticeGaugeFieldF UmuF(UGridF);
+    precisionChange(UmuF,Umu);
+    MobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,b,c);
+    SchurDiagMooeeOperator<MobiusFermionF, LatticeFermionF> HermOpEOF(DdwfF);
+
+    const int Fine_Nstop = 200;
+    const int Fine_Nk = 200;
+    const int Fine_Np = 200;
+    const int Fine_Nm = Fine_Nk+Fine_Np;
+    const int Fine_MaxIt= 10;
+
+    RealD Fine_resid = 1.0e-4;
+    std::cout << GridLogMessage << "Fine Lanczos "<<std::endl;
+    std::cout << GridLogMessage << "Nstop "<<Fine_Nstop<<std::endl;
+    std::cout << GridLogMessage << "Nk "<<Fine_Nk<<std::endl;
+    std::cout << GridLogMessage << "Np "<<Fine_Np<<std::endl;
+    std::cout << GridLogMessage << "resid "<<Fine_resid<<std::endl;
+
+    Chebyshev<FermionField> Cheby(0.002,92.0,401);
+    //    Chebyshev<FermionField> Cheby(0.1,92.0,401);
+    FunctionHermOp<FermionField> OpCheby(Cheby,HermOpEOF);
+    PlainHermOp<FermionField> Op     (HermOpEOF);
+    ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Fine_Nstop,Fine_Nk,Fine_Nm,Fine_resid,Fine_MaxIt);
+    std::vector<RealD>          Fine_eval(Fine_Nm);
+    FermionField                Fine_src(FrbGridF); 
+    Fine_src = ComplexF(1.0);
+    std::vector<FermionField> Fine_evec(Fine_Nm,FrbGridF);
+
+    int Fine_Nconv;
+    std::cout << GridLogMessage <<" Calling IRL.calc single prec"<<std::endl;
+    IRL.calc(Fine_eval,Fine_evec,Fine_src,Fine_Nconv);
+
+    std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
+    SaveFineEvecs(Fine_evec,evec_file);
+  }
+
+
+  //////////////////////////////////////////
+  // Construct a coarsened grid with 4^4 cell
+  //////////////////////////////////////////
+  // NOTE(review): the block actually used below is {4,4,6,4}, not 4^4.
+  // The integer division assumes each lattice extent is a multiple of the block extent.
+  Coordinate Block({4,4,6,4});
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/Block[d];
+  }
+
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
+							    GridDefaultSimd(Nd,vComplex::Nsimd()),
+							    GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  ///////////////////////// RNGs /////////////////////////////////
+  // cseeds repeats the values of seeds5. RNG4 is seeded but not used in the code below.
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  
+  // Adaptor whose Op/AdjOp/HermOp all forward to HermOpEO.HermOp
+  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
+  HermFineMatrix FineHermOp(HermOpEO);
+
+  ////////////////////////////////////////////////////////////
+  ///////////// Coarse basis and Little Dirac Operator ///////
+  ////////////////////////////////////////////////////////////
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
+
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse5d,FrbGrid,cb);
+
+  ////////////////////////////////////////////////////////////
+  // Need to check about red-black grid coarsening
+  ////////////////////////////////////////////////////////////
+  // Hard-coded file locations. ldop_file, evec_file, eval_file, load_mat and load_evec
+  // are declared but not referenced in the code below.
+  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
+  //  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
+  //  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
+  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
+  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
+  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
+  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
+  bool load_agg=true;
+  bool load_refine=true;
+  bool load_mat=false;
+  bool load_evec=false;
+
+  // With load_agg, refine and load_refine all set as above, the unrefined basis is not
+  // loaded here; the basis is read from refine_file further down instead.
+  int refine=1;
+  if ( load_agg ) {
+    if ( !(refine) || (!load_refine) ) { 
+      LoadBasis(Aggregates,subspace_file);
+    }
+  } else {
+    //    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
+    //    					0.0003,1.0e-5,2000); // Lo, tol, maxit
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run
+    Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.); 
+    SaveBasis(Aggregates,subspace_file);
+  }
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << "Building MultiRHS Coarse operator"<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  // NOTE(review): coarseCG is constructed but not referenced in the code below.
+  ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true);
+    
+  const int nrhs=24;
+    
+  // Six-dimensional grid for the multi-RHS coarse operator: the leading dimension has
+  // extent nrhs, is not split over MPI ranks and carries the full SIMD width.
+  Coordinate mpi=GridDefaultMpi();
+  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
+  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
+  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
+    
+  // NOTE(review): allocated with new and not deleted anywhere in this function.
+  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
+  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
+  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << "         Coarse Lanczos               "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+
+  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
+  Chebyshev<CoarseVector>      IRLCheby(0.005,42.0,301);  // 1 iter
+  MrhsHermMatrix MrhsCoarseOp     (mrhs);
+
+  //  CoarseVector pm_src(CoarseMrhs);
+  //  pm_src = ComplexD(1.0);
+  //  PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src);
+
+  int Nk=192;
+  int Nm=384;
+  int Nstop=Nk;
+  int Nconv_test_interval=1;
+  
+  // The Lanczos object is only constructed here; IRL.calc is called after the operator
+  // has been coarsened further down.
+  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
+							  Coarse5d,
+							  CoarseMrhs,
+							  nrhs,
+							  IRLCheby,
+							  Nstop,
+							  Nconv_test_interval,
+							  nrhs,
+							  Nk,
+							  Nm,
+							  1e-5,10);
+
+  int Nconv;
+  std::vector<RealD>            eval(Nm);
+  std::vector<CoarseVector>     evec(Nm,Coarse5d);
+  std::vector<CoarseVector>     c_src(nrhs,Coarse5d);
+
+  ///////////////////////
+  // Deflation guesser object
+  ///////////////////////
+  MultiRHSDeflation<CoarseVector> MrhsGuesser;
+
+  //////////////////////////////////////////
+  // Block projector for coarse/fine
+  //////////////////////////////////////////
+  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
+
+  //////////////////////////
+  // Extra HDCG parameters
+  //////////////////////////
+  // NOTE(review): CG, CGstart, BCG, DoNothing and ord are only referenced by the
+  // commented-out alternatives below; the active coarse solver is the ChebyshevInverter.
+  int maxit=300;
+  ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false);
+  ConjugateGradient<CoarseVector>  CGstart(5.0e-2,maxit,false);
+  RealD lo=2.0;
+  int ord = 7;
+  //  int ord = 11;
+
+  int blockDim = 0;//not used for BlockCG
+  BlockConjugateGradient<CoarseVector>    BCG  (BlockCGrQ,blockDim,5.0e-5,maxit,true);
+
+  DoNothingGuesser<CoarseVector> DoNothing;
+  //  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
+  //  HPDSolver<CoarseVector> HPDSolveMrhsStart(MrhsCoarseOp,CGstart,DoNothing);
+  //  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,BCG,DoNothing);
+  //  HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,BCG,DoNothing);
+  //  FixedCGPolynomial<CoarseVector>  HPDSolveMrhs(maxit,MrhsCoarseOp);
+
+  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp);  //
+  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,110,MrhsCoarseOp);  // 114 iter with Chebysmooth and BlockCG
+  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp); // 138 iter with Chebysmooth
+  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,200,MrhsCoarseOp); // 139 iter
+  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-3,40.0,200,MrhsCoarseOp); // 137 iter, CG smooth, flex
+  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-3,40.0,200,MrhsCoarseOp); // 146 iter, CG smooth, flex
+  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-4,40.0,200,MrhsCoarseOp); // 156 iter, CG smooth, flex
+
+  /////////////////////////////////////////////////
+  // Mirs smoother
+  /////////////////////////////////////////////////
+  // The object named CGsmooth is a ChebyshevSmoother acting on HermOpEO;
+  // ShiftedFineHermOp is only used by the commented-out alternatives.
+  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,lo);
+  //  FixedCGPolynomial<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
+  //  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
+  ChebyshevSmoother<LatticeFermionD> CGsmooth(2.0,92.0,8,HermOpEO) ;
+  
+  // Either read the refined basis from disk, or refine the current basis and save it.
+  if ( load_refine ) {
+    LoadBasis(Aggregates,refine_file);
+    //    LatticeFermionF conv_tmp(FrbGridF);
+    //    LoadBasisSum(Aggregates,refine_file,sample,conv_tmp);
+  } else {
+    Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters
+    SaveBasis(Aggregates,refine_file);
+  }
+  Aggregates.Orthogonalise();
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << "Coarsen after refine"<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << " Recompute coarse evecs  "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  evec.resize(Nm,Coarse5d);
+  eval.resize(Nm);
+  // Random starting vectors for the block Lanczos, one per right-hand side
+  for(int r=0;r<nrhs;r++){
+    random(CRNG,c_src[r]);
+  }
+ IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl);
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << " Reimport coarse evecs  "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  MrhsGuesser.ImportEigenBasis(evec,eval);
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << " Setting up mRHS HDCG"<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
+  MrhsProjector.ImportBasis(Aggregates.subspace);
+      
+  std::cout << "**************************************"<<std::endl;
+  std::cout << "Calling mRHS HDCG"<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  // The same coarse solver object is passed for both the M1 and Vstart roles.
+  TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
+    HDCGmrhs(1.0e-8, 300,
+	     FineHermOp,
+	     CGsmooth,
+	     HPDSolveMrhs,    // Used in M1
+	     HPDSolveMrhs,          // Used in Vstart
+	     MrhsProjector,
+	     MrhsGuesser,
+	     CoarseMrhs);
+    
+  std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
+  std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
+  // result_sloppy, error and residual are only used inside the disabled #if 0 section.
+  LatticeFermionD result_accurate(FrbGrid);
+  LatticeFermionD result_sloppy(FrbGrid);
+  LatticeFermionD error(FrbGrid);
+  LatticeFermionD residual(FrbGrid);
+
+  // Random sources and zero initial guesses for every right-hand side
+  for(int r=0;r<nrhs;r++){
+    random(RNG5,src_mrhs[r]);
+    res_mrhs[r]=Zero();
+  }
+  HDCGmrhs(src_mrhs,res_mrhs);
+  result_accurate = res_mrhs[0];
+
+  // Disabled: power-spectrum diagnostics and repeated solves at looser tolerances,
+  // each compared against result_accurate.
+#if 0
+
+  std::vector<RealD>   bins({1.0e-3,1.0e-2,1.0e-1,1.0,10.0,100.0});
+  std::vector<int>   orders({6000  ,4000  ,1000  ,500,500 ,500});
+  PowerSpectrum GraphicEqualizer(bins,orders);
+  
+  std::cout << "**************************************"<<std::endl;
+  std::cout << GridLogMessage << " PowerSpectrum of rrr "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  GraphicEqualizer(FineHermOp,HDCGmrhs.rrr);
+  std::cout << "**************************************"<<std::endl;
+  std::cout << GridLogMessage << " PowerSpectrum of sss "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  GraphicEqualizer(FineHermOp,HDCGmrhs.sss);
+  std::cout << "**************************************"<<std::endl;
+  std::cout << GridLogMessage << " PowerSpectrum of qqq "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  GraphicEqualizer(FineHermOp,HDCGmrhs.qqq);
+  std::cout << "**************************************"<<std::endl;
+  std::cout << GridLogMessage << " PowerSpectrum of zzz "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  GraphicEqualizer(FineHermOp,HDCGmrhs.zzz);
+
+  std::vector<RealD> tols({1.0e-3,1.0e-4,1.0e-5});
+
+
+  for(auto tol : tols) {
+    
+    TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
+      HDCGmrhsSloppy(tol, 500,
+		     FineHermOp,
+		     CGsmooth,
+		     HPDSolveMrhs,    // Used in M1
+		     HPDSolveMrhs,    // Used in Vstart
+		     MrhsProjector,
+		     MrhsGuesser,
+		     CoarseMrhs);
+  
+    //  Solve again to 10^-5
+    for(int r=0;r<nrhs;r++){
+      res_mrhs[r]=Zero();
+    }
+    HDCGmrhsSloppy(src_mrhs,res_mrhs);
+    
+    result_sloppy = res_mrhs[0];
+    error = result_sloppy - result_accurate;
+    FineHermOp.HermOp(result_sloppy,residual);
+    residual = residual - src_mrhs[0];
+    
+    std::cout << "**************************************"<<std::endl;
+    std::cout << GridLogMessage << " Converged to tolerance "<< tol<<std::endl;
+    std::cout << GridLogMessage << " Absolute error "<<norm2(error)<<std::endl;
+    std::cout << GridLogMessage << " Residual       "<<norm2(residual)<<std::endl;
+    std::cout << "**************************************"<<std::endl;
+
+    std::cout << "**************************************"<<std::endl;
+    std::cout << GridLogMessage << " PowerSpectrum of error   "<<std::endl;
+    std::cout << "**************************************"<<std::endl;
+    GraphicEqualizer(FineHermOp,error);
+    std::cout << "**************************************"<<std::endl;
+    std::cout << GridLogMessage << " PowerSpectrum of residual   "<<std::endl;
+    std::cout << "**************************************"<<std::endl;
+    GraphicEqualizer(FineHermOp,residual);
+
+  };
+#endif
+  
+  // Standard CG
+  // Disabled: plain red-black CG on HermOpEO with a random source.
+#if 0
+  {
+  std::cout << "**************************************"<<std::endl;
+  std::cout << "Calling red black CG"<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+      
+    LatticeFermion result(FrbGrid); result=Zero();
+    LatticeFermion    src(FrbGrid); random(RNG5,src);
+    result=Zero();
+
+    ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
+    CGfine(HermOpEO, src, result);
+  }
+#endif  
+  Grid_finalize();
+  return 0;
+}
--- a/tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
@ -0,0 +1,355 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
+#include <Grid/algorithms/iterative/AdefMrhs.h>
+
+using namespace std;
+using namespace Grid;
+
+// Write every field held in Agg to the SciDAC file `file`, one record per
+// field, using the lexicographic binary layout.
+//   Agg  : indexable container of lattice fields (needs size() and operator[]);
+//          must hold at least one field, because the writer's boss flag is
+//          taken from the grid of Agg[0].
+//   file : path handed to ScidacWriter::open.
+// The body is compiled out (the call does nothing) unless HAVE_LIME is defined.
+template<class aggregation>
+void SaveFineEvecs(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  // Agg[0] is dereferenced below; indexing an empty container is undefined
+  // behaviour, so fail loudly instead.
+  assert(Agg.size()>0);
+  emptyUserRecord record;
+  ScidacWriter WR(Agg[0].Grid()->IsBoss());
+  WR.open(file);
+  for(size_t b=0;b<Agg.size();b++){
+    WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+  }
+  WR.close();
+#endif
+}
+// Dump the vectors in Agg.subspace to the SciDAC file `file`, one record per
+// vector, in lexicographic binary layout. The writer's boss flag comes from
+// Agg.FineGrid. Does nothing when HAVE_LIME is not defined.
+template<class aggregation>
+void SaveBasis(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacWriter writer(Agg.FineGrid->IsBoss());
+  writer.open(file);
+  int idx=0;
+  while( idx<Agg.subspace.size() ){
+    writer.writeScidacFieldRecord(Agg.subspace[idx],userRecord,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+    idx++;
+  }
+  writer.close();
+#endif
+}
+// Fill Agg.subspace from the SciDAC file `file`: one record is read per
+// existing entry of Agg.subspace, in order, using the lexicographic binary
+// layout. Does nothing when HAVE_LIME is not defined.
+template<class aggregation>
+void LoadBasis(aggregation &Agg, std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacReader reader;
+  reader.open(file);
+  int idx=0;
+  while( idx<Agg.subspace.size() ){
+    reader.readScidacFieldRecord(Agg.subspace[idx],userRecord,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+    idx++;
+  }
+  reader.close();
+#endif
+}
+// Read Agg.size() records from the SciDAC file `file`. Each record is read
+// into the caller-supplied scratch field conv_tmp and then converted into
+// Agg[b] with precisionChange. Does nothing when HAVE_LIME is not defined.
+template<class aggregation>
+void LoadFineEvecs(aggregation &Agg, std::string file,LatticeFermionF & conv_tmp)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord userRecord;
+  ScidacReader reader;
+  reader.open(file);
+  int idx=0;
+  while( idx<Agg.size() ){
+    // stage the record in conv_tmp, then convert into the destination field
+    reader.readScidacFieldRecord(conv_tmp,userRecord,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
+    precisionChange(Agg[idx],conv_tmp);
+    idx++;
+  }
+  reader.close();
+#endif
+}
+// Write the eigenvectors to the SciDAC file evec_file (one record per vector)
+// and the eigenvalues to the XML file eval_file under the tag "evals".
+//   eval : eigenvalues, written as a single XML entry.
+//   evec : eigenvectors; must hold at least one field, because the writer's
+//          boss flag is taken from the grid of evec[0].
+// The body is compiled out (the call does nothing) unless HAVE_LIME is defined.
+template<class CoarseVector>
+void SaveEigenvectors(std::vector<RealD>            &eval,
+		      std::vector<CoarseVector>     &evec,
+		      std::string evec_file,
+		      std::string eval_file)
+{
+#ifdef HAVE_LIME
+  // evec[0] is dereferenced below; operator[] on an empty std::vector is
+  // undefined behaviour, so fail loudly instead.
+  assert(evec.size()>0);
+  emptyUserRecord record;
+  ScidacWriter WR(evec[0].Grid()->IsBoss());
+  WR.open(evec_file);
+  for(size_t b=0;b<evec.size();b++){
+    WR.writeScidacFieldRecord(evec[b],record,0,0);
+  }
+  WR.close();
+  XmlWriter WRx(eval_file);
+  write(WRx,"evals",eval);
+#endif
+}
+// Read eigenvalues from the XML file eval_file (tag "evals") and then one
+// SciDAC record per eigenvalue from evec_file into evec[k].
+// evec must already have the same length as the eigenvalue list (asserted
+// after the eigenvalues are read). Does nothing when HAVE_LIME is not defined.
+template<class CoarseVector>
+void LoadEigenvectors(std::vector<RealD>            &eval,
+		      std::vector<CoarseVector>     &evec,
+		      std::string evec_file,
+		      std::string eval_file)
+{
+#ifdef HAVE_LIME
+  // Eigenvalues first: eval.size() is the loop bound for the records below.
+  XmlReader evalReader(eval_file);
+  read(evalReader,"evals",eval);
+
+  emptyUserRecord userRecord;
+  Grid::ScidacReader evecReader;
+  evecReader.open(evec_file);
+  assert(evec.size()==eval.size());
+  int k=0;
+  while( k<eval.size() ){
+    evecReader.readScidacFieldRecord(evec[k],userRecord);
+    k++;
+  }
+  evecReader.close();
+#endif
+}
+
+// Adaptor that exposes the wrapped operator's HermOp through Op, AdjOp and
+// HermOp alike. Purpose (per the original note): code such as CoarsenOp that
+// calls Op should end up calling MatPcDagMatPc.
+// The stencil-style entry points and HermOpAndNorm are unsupported and abort
+// through assert(0).
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & wrapped;
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {}
+
+  // Every application forwards to the wrapped HermOp.
+  void Op(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+  void AdjOp(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+  void HermOp(const Field &in, Field &out)
+  {
+    wrapped.HermOp(in,out);
+  }
+
+  // Not supported by this adaptor.
+  void OpDiag(const Field &in, Field &out)
+  {
+    assert(0);
+  }
+  void OpDir(const Field &in, Field &out,int dir,int disp)
+  {
+    assert(0);
+  }
+  void OpDirAll(const Field &in, std::vector<Field> &out)
+  {
+    assert(0);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
+  {
+    assert(0);
+  }
+};
+
+// Smoother implemented as a fixed number of conjugate-gradient iterations on
+// the supplied operator, always starting from a zero guess.
+//   iters             : iteration cap handed to ConjugateGradient.
+//   _SmootherOperator : operator the CG is run on (held by reference).
+template<class Field> class CGSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+
+  FineOperator   & _SmootherOperator;
+  int iters;
+
+  CGSmoother(int _iters, FineOperator &SmootherOperator)
+    : _SmootherOperator(SmootherOperator), iters(_iters)
+  {
+    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
+  }
+
+  // out <- result of `iters` CG iterations on _SmootherOperator with source `in`.
+  void operator() (const Field &in, Field &out)
+  {
+    // Tolerance 0.0 with the third argument false: failing to converge within
+    // `iters` iterations is acceptable for a smoother.
+    ConjugateGradient<Field> solver(0.0,iters,false);
+    out=Zero();
+    solver(_SmootherOperator,in,out);
+  }
+};
+
+
+int main (int argc, char ** argv) // Driver: prints how well two coarse subspaces reproduce the loaded fine eigenvectors
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=24;       // fifth-dimension extent passed to makeFiveDimGrid / makeFiveDimRedBlackGrid
+  const int nbasis = 62; // basis size: template argument of Aggregation / GeneralCoarsenedMatrix, count for MrhsProjector.Allocate
+  const int cb = 0 ;     // passed to the Aggregation constructor (presumably the checkerboard -- TODO confirm)
+  RealD mass=0.00078;    // mass, M5, b and c are forwarded to the MobiusFermionD constructor below
+  RealD M5=1.8;
+  RealD b=1.5;
+  RealD c=0.5;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid: each fine lattice extent is divided by the matching Block factor {4,4,6,4}
+  Coordinate Block({4,4,6,4});
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/Block[d]; // integer division: assumes the lattice extent is a multiple of Block[d] -- TODO confirm
+  }
+
+  //////////////////////////////////////////
+  // Double precision grids 
+  //////////////////////////////////////////
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
+							    GridDefaultSimd(Nd,vComplex::Nsimd()),
+							    GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); // coarse grid has fifth-dimension extent 1
+
+  //////////////////////////////////////////
+  // Single precision grids -- in this function only FrbGridF is used afterwards (for conv_tmp)
+  //////////////////////////////////////////
+  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplexF::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+  ///////////////////////// RNGs (seeded here; not referenced again in this function) /////////////////////////////////
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  ///////////////////////// Configuration /////////////////////////////////
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat.1000"); // relative path: the gauge configuration is read via NerscIO from the working directory
+  NerscIO::readConfiguration(Umu,header,file);
+
+  //////////////////////// Fermion action //////////////////////////////////
+  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
+
+  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
+  
+  const int Fine_Nstop = 200; // number of fine eigenvalue / eigenvector slots allocated and loaded below
+  const int Fine_Nk = 100;    // Fine_Nk, Fine_Np and Fine_Nm are not used further in this function
+  const int Fine_Np = 100;
+  const int Fine_Nm = Fine_Nk+Fine_Np;
+
+  typedef LatticeFermion FermionField;
+  std::vector<RealD>        Fine_eval;
+  std::vector<FermionField> Fine_evec;
+
+  LatticeFermionF conv_tmp(FrbGridF); // single-precision scratch field that LoadFineEvecs reads each record into
+  Fine_eval.resize(Fine_Nstop);
+  Fine_evec.resize(Fine_Nstop,FrbGrid);
+  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
+  LoadFineEvecs(Fine_evec,evec_file,conv_tmp); // fills Fine_evec[b] via precisionChange; does nothing without HAVE_LIME
+  
+  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
+  HermFineMatrix FineHermOp(HermOpEO); // not referenced again in this function
+
+  ////////////////////////////////////////////////////////////
+  ///////////// Coarse basis and Little Dirac Operator ///////
+  ////////////////////////////////////////////////////////////
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); // not referenced again in this function
+
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse5d,FrbGrid,cb);
+
+  ////////////////////////////////////////////////////////////
+  // Need to check about red-black grid coarsening
+  ////////////////////////////////////////////////////////////
+  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
+  //  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
+  //  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evec");
+  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
+  //  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
+  //  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
+  //  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
+  bool load_agg=true;    // load_agg and load_refine are not consulted anywhere below
+  bool load_refine=true;
+
+  //////////////////////////////////////////
+  // Block projector for coarse/fine
+  //////////////////////////////////////////
+  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
+
+
+  /////////////////////////////////////////////////
+  // Mirs smoother: CGSmoother running `ord` CG iterations on HermOpEO wrapped with shift MirsShift
+  /////////////////////////////////////////////////
+  int ord=8;
+  RealD lo=2.0;
+  RealD MirsShift = lo;
+  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
+  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
+  
+  LoadBasis(Aggregates,refine_file); // fills Aggregates.subspace from refine_file; does nothing without HAVE_LIME
+  Aggregates.Orthogonalise();
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << " Using filtered subspace"<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
+  MrhsProjector.ImportBasis(Aggregates.subspace);
+
+  FermionField Ftmp(FrbGrid);
+  std::vector<FermionField> Fine_ev(1,FrbGrid);            // one-element vectors: the projector calls below take std::vector arguments
+  std::vector<FermionField> Fine_ev_compressed(1,FrbGrid);
+  std::vector<CoarseVector>  c_evec(1,Coarse5d);
+  for(int ev=0;ev<Fine_evec.size();ev++){
+    Fine_ev[0] = Fine_evec[ev];
+    MrhsProjector.blockProject(Fine_ev,c_evec);            // fine eigenvector -> coarse vector
+    MrhsProjector.blockPromote(Fine_ev_compressed,c_evec); // coarse vector -> fine field ("compressed" eigenvector)
+    Ftmp = Fine_ev_compressed[0];
+    RealD div = 1.0/ sqrt(norm2(Ftmp)); // rescale the compressed vector by 1/sqrt(norm2) before comparing
+    Ftmp = Ftmp * div;
+    std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0])  <<std::endl;
+    std::cout << GridLogMessage<<" "<<ev<<" comp   "<< norm2(Ftmp)  <<std::endl;
+    Ftmp = Fine_ev[0] - Ftmp;           // difference between the original and the rescaled compressed vector
+    std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp)  <<std::endl;
+    CGsmooth(Fine_ev_compressed[0],Ftmp); // Ftmp <- smoother applied to the (unnormalised) compressed vector
+    Ftmp = Ftmp *lo;                      // multiply the smoother output by lo (the same value used as MirsShift)
+    std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp)  <<std::endl;
+    div = 1.0/ sqrt(norm2(Ftmp));
+    Ftmp=Ftmp*div;
+    Ftmp = Fine_ev[0]-Ftmp;               // difference between the original and the rescaled smoothed vector
+    std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp)  <<std::endl;
+  }
+
+  std::cout << "**************************************"<<std::endl;
+  std::cout << " Using eigenvector subspace "<<std::endl;
+  std::cout << "**************************************"<<std::endl;
+  for(int i=0;i<Aggregates.subspace.size();i++){
+    Aggregates.subspace[i] = Fine_evec[i]; // replace the basis with the leading loaded eigenvectors
+  }
+  Aggregates.Orthogonalise();
+  MrhsProjector.ImportBasis(Aggregates.subspace);
+  for(int ev=0;ev<Fine_evec.size();ev++){ // same measurement sequence as the loop above, now with the eigenvector basis
+    Fine_ev[0] = Fine_evec[ev];
+    MrhsProjector.blockProject(Fine_ev,c_evec);
+    MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
+    Ftmp = Fine_ev_compressed[0];
+    RealD div = 1.0/ sqrt(norm2(Ftmp));
+    Ftmp = Ftmp * div;
+    std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0])  <<std::endl;
+    std::cout << GridLogMessage<<" "<<ev<<" comp   "<< norm2(Ftmp)  <<std::endl;
+    Ftmp = Fine_ev[0] - Ftmp;
+    std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp)  <<std::endl;
+    CGsmooth(Fine_ev_compressed[0],Ftmp);
+    Ftmp = Ftmp *lo;
+    std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp)  <<std::endl;
+    div = 1.0/ sqrt(norm2(Ftmp));
+    Ftmp=Ftmp*div;
+    Ftmp = Fine_ev[0]-Ftmp;
+    std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp)  <<std::endl;
+  }
+
+  // No standard CG comparison is run in this test; only finalisation remains
+  Grid_finalize();
+  return 0;
+}
--- a/tests/debug/Test_general_coarse_pvdagm.cc
+++ b/tests/debug/Test_general_coarse_pvdagm.cc
@ -36,28 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;

-template<class Field>
-class HermOpAdaptor : public LinearOperatorBase<Field>
-{
-  LinearOperatorBase<Field> & wrapped;
-public:
-  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
-  void OpDiag (const Field &in, Field &out) {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
-  void Op     (const Field &in, Field &out){
-    wrapped.HermOp(in,out);
-  }
-  void AdjOp     (const Field &in, Field &out){
-    wrapped.HermOp(in,out);
-  }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
-  void HermOp(const Field &in, Field &out){
-    wrapped.HermOp(in,out);
-  }
-  
-};
-
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
@ -69,78 +47,169 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
   void Op     (const Field &in, Field &out){
+    // out = _PV.Mdag( _Mat.M(in) ); trace disabled: std::cout << "Op: PVdag M "<<std::endl;
     Field tmp(in.Grid());
     _Mat.M(in,tmp);
     _PV.Mdag(tmp,out);
   }
   void AdjOp     (const Field &in, Field &out){
+    // out = _Mat.Mdag( _PV.M(in) ); trace disabled: std::cout << "AdjOp: Mdag PV "<<std::endl;
     Field tmp(in.Grid());
-    _PV.M(tmp,out);
-    _Mat.Mdag(in,tmp);
+    _PV.M(in,tmp);       // stage 1: tmp <- _PV.M applied to in
+    _Mat.Mdag(tmp,out);  // stage 2: out <- _Mat.Mdag applied to tmp
   }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
   void HermOp(const Field &in, Field &out){
-    std::cout << "HermOp"<<std::endl;
+    // out = AdjOp( Op(in) ), i.e. Mdag PV PVdag M applied to in; the commented lines below spell out the same sequence
+    Field tmp(in.Grid());
+    //    _Mat.M(in,tmp);
+    //    _PV.Mdag(tmp,out);
+    //    _PV.M(out,tmp);
+    //    _Mat.Mdag(tmp,out);
+    Op(in,tmp);
+    AdjOp(tmp,out);
+    // trace disabled: std::cout << "HermOp done "<<norm2(out)<<std::endl;
+  }
+};
+// Linear operator built from two matrices and a real shift:
+//   Op     : out = _PV.Mdag( _Mat.M(in) ) + shift * in
+//   AdjOp  : out = _Mat.Mdag( _PV.M(in) ) + shift * in
+//   HermOp : out = AdjOp( Op(in) )
+// The stencil-style entry points (OpDiag, OpDir, OpDirAll) and HermOpAndNorm
+// are not supported and abort through assert(0).
+template<class Matrix,class Field>
+class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+  Matrix &_PV;
+  RealD shift;
+public:
+  // Initialiser list follows the member declaration order (_Mat, _PV, shift).
+  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
+
+  void OpDiag (const Field &in, Field &out) {    assert(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
+  void Op     (const Field &in, Field &out){
+    // out = _PV.Mdag( _Mat.M(in) ) + shift * in
     Field tmp(in.Grid());
     _Mat.M(in,tmp);
     _PV.Mdag(tmp,out);
-    _PV.M(out,tmp);
-    _Mat.Mdag(tmp,out);
-    std::cout << "HermOp done "<<norm2(out)<<std::endl;
-    
+    out = out + shift * in;
   }
-};
-
-template<class Field> class DumbOperator  : public LinearOperatorBase<Field> {
-public:
-  LatticeComplex scale;
-  DumbOperator(GridBase *grid) : scale(grid)
-  {
-    scale = 0.0;
-    LatticeComplex scalesft(grid);
-    LatticeComplex scaletmp(grid);
-    for(int d=0;d<4;d++){
-      Lattice<iScalar<vInteger> > x(grid); LatticeCoordinate(x,d+1);
-      LatticeCoordinate(scaletmp,d+1);
-      scalesft = Cshift(scaletmp,d+1,1);
-      scale = 100.0*scale + where( mod(x    ,2)==(Integer)0, scalesft,scaletmp);
-    }
-    std::cout << " scale\n" << scale << std::endl;
-  }
-  // Support for coarsening to a multigrid
-  void OpDiag (const Field &in, Field &out) {};
-  void OpDir  (const Field &in, Field &out,int dir,int disp){};
-  void OpDirAll  (const Field &in, std::vector<Field> &out) {};
-
-  void Op     (const Field &in, Field &out){
-    out = scale * in;
-  }
-  void AdjOp  (const Field &in, Field &out){
-    out = scale * in;
+  void AdjOp     (const Field &in, Field &out){
+    // out = _Mat.Mdag( _PV.M(in) ) + shift * in
+    // tmp has to be filled from `in` first; applying _PV.M to the freshly
+    // constructed tmp would read a field that was never written.
+    Field tmp(in.Grid());
+    _PV.M(in,tmp);
+    _Mat.Mdag(tmp,out);
+    out = out + shift * in;
   }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
   void HermOp(const Field &in, Field &out){
-    double n1, n2;
-    HermOpAndNorm(in,out,n1,n2);
-  }
-  void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
-    ComplexD dot;
-
-    out = scale * in;
-
-    dot= innerProduct(in,out);
-    n1=real(dot);
-
-    dot = innerProduct(out,out);
-    n2=real(dot);
+    // out = AdjOp( Op(in) ): both shifted factors are applied in sequence
+    Field tmp(in.Grid());
+    Op(in,tmp);
+    AdjOp(tmp,out);
   }
 };
+// Two-level preconditioner applied as a LinearFunction on fine fields.
+// One application performs, in order: pre-smoother on the input, coarse
+// solve on the projected residual, and post-smoother on the residual that
+// remains; the three contributions are accumulated into `out`.
+// Each timed stage logs its wall time through GridLogMessage.
+template<class Fobj,class CComplex,int nbasis>
+class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
+public:
+  using LinearFunction<Lattice<Fobj> >::operator();

+  // Types taken from the aggregation
+  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
+  using FineField      = typename Aggregation<Fobj,CComplex,nbasis>::FineField;
+  using CoarseVector   = typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector;
+  using CoarseMatrix   = typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix;
+  // Operator / solver interfaces on each level
+  using FineOperator   = LinearOperatorBase<FineField>;
+  using FineSmoother   = LinearFunction    <FineField>;
+  using CoarseOperator = LinearOperatorBase<CoarseVector>;
+  using CoarseSolver   = LinearFunction    <CoarseVector>;
+
+  // All collaborators are held by reference; the caller keeps them alive.
+  Aggregates     & _Aggregates;
+  FineOperator   & _FineOperator;
+  FineSmoother   & _PreSmoother;
+  FineSmoother   & _PostSmoother;
+  CoarseOperator & _CoarseOperator;
+  CoarseSolver   & _CoarseSolve;
+
+  int    level;
+  void Level(int lv) { level = lv; }
+
+  MGPreconditioner(Aggregates &Agg,
+		   FineOperator &Fine,
+		   FineSmoother &PreSmoother,
+		   FineSmoother &PostSmoother,
+		   CoarseOperator &CoarseOperator_,
+		   CoarseSolver &CoarseSolve_)
+    : _Aggregates(Agg),
+      _FineOperator(Fine),
+      _PreSmoother(PreSmoother),
+      _PostSmoother(PostSmoother),
+      _CoarseOperator(CoarseOperator_),
+      _CoarseSolve(CoarseSolve_),
+      level(1)
+  {
+  }
+
+  // out <- pre-smooth(in) + promote(coarse solve(project(residual))) + post-smooth(residual)
+  virtual void operator()(const FineField &in, FineField & out) 
+  {
+    GridBase *coarse_grid = _Aggregates.CoarseGrid;
+    CoarseVector coarse_src(coarse_grid);
+    CoarseVector coarse_sol(coarse_grid);
+    FineField work(in.Grid());        // residual, then promoted coarse correction
+    FineField post_corr(in.Grid());   // post-smoother correction
+
+    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
+
+    // Stage 1: pre-smoother, starting from a zero guess
+    out = Zero();
+    double elapsed = -usecond();
+    _PreSmoother(in,out);
+    elapsed += usecond();
+
+    std::cout<<GridLogMessage << "PreSmoother took "<< elapsed/1000.0<< "ms" <<std::endl;
+
+    // Residual after pre-smoothing: work = in - Op(out)
+    _FineOperator.Op(out,work);
+    sub(work, in ,work);
+
+    // Stage 2a: restrict the residual to the coarse space
+    elapsed = -usecond();
+    _Aggregates.ProjectToSubspace  (coarse_src,work);
+    elapsed += usecond();
+    std::cout<<GridLogMessage << "Project to coarse took "<< elapsed/1000.0<< "ms" <<std::endl;
+
+    // Stage 2b: coarse solve from a zero guess
+    elapsed = -usecond();
+    coarse_sol = Zero();
+    _CoarseSolve(coarse_src,coarse_sol);
+    elapsed += usecond();
+    std::cout<<GridLogMessage << "Coarse solve took "<< elapsed/1000.0<< "ms" <<std::endl;
+
+    // Stage 2c: prolongate the coarse solution and add it to the output
+    elapsed = -usecond();
+    _Aggregates.PromoteFromSubspace(coarse_sol,work);
+    add(out,out,work);
+    elapsed += usecond();
+    std::cout<<GridLogMessage << "Promote to this level took "<< elapsed/1000.0<< "ms" <<std::endl;
+
+    // Residual after the coarse correction: work = in - Op(out)
+    _FineOperator.Op(out,work);
+    sub(work ,in , work);
+
+    // Stage 3: post-smoother on that residual, from a zero guess
+    elapsed = -usecond();
+    post_corr = Zero();
+    _PostSmoother(work,post_corr);
+    elapsed += usecond();
+    std::cout<<GridLogMessage << "PostSmoother took "<< elapsed/1000.0<< "ms" <<std::endl;
+
+    add( out,out,post_corr);
+    std::cout<<GridLogMessage << "Done " <<std::endl;
+  }
+};

 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

-  const int Ls=2;
+  const int Ls=16;

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -151,7 +220,8 @@ int main (int argc, char ** argv)
  // Construct a coarsened grid
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
-    clatt[d] = clatt[d]/4;
+    clatt[d] = clatt[d]/2;
+    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
@ -173,15 +243,14 @@ int main (int argc, char ** argv)
  FieldMetaData header;
  std::string file("ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
-  //Umu = 1.0;
  
-  RealD mass=0.5;
+  RealD mass=0.01;
  RealD M5=1.8;

  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);

-  const int nbasis = 1;
+  const int nbasis = 20;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);

@ -193,25 +262,51 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
-  
-  PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM(Ddwf,Dpv);
-  HermOpAdaptor<LatticeFermionD> HOA(PVdagM);
+
+  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
+  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
+  PVdagM_t PVdagM(Ddwf,Dpv);
+  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
+  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric 
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... 
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ...  with 24 vecs
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ...  with 24 vecs and 2^4 blocking
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ...  with 16 vecs and 2^4 blocking, sloppier
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking, looser coarse
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64  ...  with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
+  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // 
+

  // Run power method on HOA??
-  PowerMethod<LatticeFermion>       PM;   PM(HOA,src);
+  //  PowerMethod<LatticeFermion>       PM;   PM(PVdagM,src);
 
  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace AggregatesPD(Coarse5d,FGrid,cb);
+  /*
  AggregatesPD.CreateSubspaceChebyshev(RNG5,
-				       HOA,
+				       PVdagM,
 				       nbasis,
-				       5000.0,
-				       0.02,
-				       100,
-				       50,
-				       50,
+				       4000.0,
+				       2.0,
+				       200,
+				       200,
+				       200,
 				       0.0);
+  */
+  AggregatesPD.CreateSubspaceGCR(RNG5,
+				 PVdagM,
+				 nbasis);
  
  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
  LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD);
@ -257,6 +352,60 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;

+
+  /**********
+   * Some solvers
+   **********
+   */
+
+  ///////////////////////////////////////
+  // Coarse grid solver test
+  ///////////////////////////////////////
+
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<CoarseVector> simple;
+  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10); 
+  L2PGCR.Level(3);
+  c_res=Zero();
+  L2PGCR(c_src,c_res);
+
+  ////////////////////////////////////////
+  // Fine grid smoother
+  ////////////////////////////////////////
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<LatticeFermionD> simple_fine;
+  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
+  SmootherGCR.Level(2);
+  
+  LatticeFermionD f_src(FGrid);
+  LatticeFermionD f_res(FGrid);
+
+  f_src = one;  // 1 in every element for vector 1.
+  f_res=Zero();
+  SmootherGCR(f_src,f_res);
+
+  typedef MGPreconditioner<vSpinColourVector,  vTComplex,nbasis> TwoLevelMG;
+
+  TwoLevelMG TwoLevelPrecon(AggregatesPD,
+			    PVdagM,
+			    simple_fine,
+			    SmootherGCR,
+			    LinOpCoarse,
+			    L2PGCR);
+  
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
+  L1PGCR.Level(1);
+
+  f_res=Zero();
+  L1PGCR(f_src,f_res);
+
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Chulwoo Jung	a957e7bfa1	Adding DWF evec Chirality measurement	2025-04-22 22:17:51 +00:00
Chulwoo Jung	cee4c8ce8c	Merge branch 'develop' of https://github.com/paboyle/Grid into specflow	2025-04-18 19:55:36 +00:00
Peter Boyle	e652fc2825	Shared Memory test reenabled on every Grid object creation. Const improvements in Accelerator.h	2025-04-07 11:51:40 -04:00
Peter Boyle	a49fa3f8d0	ROCM 6.3.1 appears to work	2025-04-07 11:50:59 -04:00
Peter Boyle	cd452a2f91	Slurm update	2025-04-04 18:40:20 -04:00
Peter Boyle	4f89f603ae	Changes to add back shared memory test on GPU	2025-04-04 18:40:15 -04:00
Peter Boyle	11dc2c5e1d	PVdagM initialise	2025-04-04 18:35:06 -04:00
Peter Boyle	6fec3c15ca	Cleaner printing	2025-04-04 18:35:06 -04:00
Peter Boyle	938c47480f	Updated compile on frontier. Unsatisfactory hacsk	2025-04-04 18:35:06 -04:00
Peter Boyle	3811d19298	Fence	2025-04-04 18:35:06 -04:00
Peter Boyle	83a3ab6b6f	Barrier -- not sure 100% this was needed	2025-04-04 18:35:05 -04:00
Peter Boyle	d66a9af6a3	No compile fix	2025-04-04 18:35:05 -04:00
Peter Boyle	adc90d3a86	NVLINK GET/PUT on cuda aware mpi	2025-04-04 18:35:05 -04:00
Peter Boyle	ebbd015c5c	Deprecate shared memory copy as direction matters on nvidia GPU	2025-04-04 18:35:05 -04:00
Peter Boyle	4ab73b36b2	Deprecate shared memory copy as direction matters on GPU	2025-04-04 18:35:05 -04:00
Peter Boyle	130e07a422	Non hermitian support	2025-04-04 18:35:05 -04:00
Peter Boyle	8f47bb367e	Shifted non herm	2025-04-04 18:35:05 -04:00
Peter Boyle	0c3cb60135	Script update	2025-04-04 18:35:05 -04:00
Peter Boyle	9eae8fca5d	Size outut	2025-04-04 18:35:05 -04:00
Peter Boyle	882a217074	Example of Useful prerequisite installs with spack	2025-03-26 11:28:53 -04:00
Peter Boyle	199818bd6c	Merge pull request #475 from lehner/feature-aurora Sync with GPT on Aurora	2025-03-13 08:55:55 -04:00
Christoph Lehner	fe66c7ca30	verbosity	2025-03-13 12:49:36 +00:00
Christoph Lehner	e9177e4af3	Blas compatibility	2025-03-13 08:48:23 +00:00
Christoph Lehner	d15a6c5933	Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora	2025-03-13 07:29:55 +00:00
Peter Boyle	25ab9325e7	Use hostVector but remove construct resize	2025-03-11 15:02:32 +00:00
Peter Boyle	19f9378b98	Should work on Aurora nowb	2025-03-11 13:50:43 +00:00
Christoph Lehner	9ffd1ed4ce	Merged	2025-03-08 15:30:08 +00:00
Peter Boyle	3d014864e2	Makinig LLVM happy	2025-03-06 14:19:25 -05:00
Peter Boyle	1d22841811	Working on aurora, GPT issue turned up is fixed	2025-03-06 03:20:18 +00:00
Peter Boyle	a1cdda833f	Update WorkArounds.txt	2025-03-05 14:04:23 -05:00
Peter Boyle	ad6db92690	Update WorkArounds.txt	2025-03-05 14:00:26 -05:00
Peter Boyle	e8ff9d8e50	Update WorkArounds.txt	2025-03-05 14:00:04 -05:00
Peter Boyle	795769c636	Update WorkArounds.txt	2025-03-05 13:50:41 -05:00
Peter Boyle	267a39d943	Update WorkArounds.txt	2025-03-05 13:49:43 -05:00
Peter Boyle	3624bd3d22	Update WorkArounds.txt	2025-03-05 13:45:09 -05:00
Peter Boyle	bc12dbbb38	Update WorkArounds.txt	2025-03-05 12:48:56 -05:00
Peter Boyle	eb8a008a8f	Create WorkArounds.txt	2025-03-05 12:41:59 -05:00
Peter Boyle	c4d9aa1a21	Config command that makes GPT happier	2025-02-27 20:12:49 +00:00
Peter Boyle	6ae809ed40	Print not liked on GPT compile	2025-02-27 20:12:49 +00:00
Peter Boyle	311e2aab3f	Update Accelerator.h	2025-02-26 11:42:52 -05:00
Peter Boyle	438dfbdb83	Only throw if there is a pending list entry in CommsComplete	2025-02-25 16:57:27 +00:00
Peter Boyle	b2ce760cf4	Verbose issue with GPT	2025-02-25 16:55:23 +00:00
Peter Boyle	ba9bbe0221	Bounce MPI through host	2025-02-12 19:34:59 +00:00
Peter Boyle	4c3dd82d84	CSHIFT with bounce throuhgh Host memory on MPI packets	2025-02-12 19:09:53 +00:00
Peter Boyle	44e911b5b7	Comment change	2025-02-12 17:37:55 +00:00
Peter Boyle	a7a16df9d0	GET not put has kinder barrier sequence for NVLINK type access as when GET is done, I can use it without barrier. Moves a barrier to a nicer place, overlapped with DtoH DMA	2025-02-12 14:59:28 +00:00
Peter Boyle	382e0abefd	Was issueing a double fence -- the gather also fences	2025-02-12 14:57:28 +00:00
Peter Boyle	6fdefe5b90	Barrier sequencing if doing "GET" not "PUT" is different. This is somewhat better timing for Barriers	2025-02-12 14:55:20 +00:00
Peter Boyle	4788dd8e2e	More states in packet progression for GPU non aware MPI	2025-02-12 14:53:57 +00:00
Peter Boyle	1cc5f221f3	GET not put ordering is better as I know when I've got all MY data	2025-02-12 14:53:05 +00:00
Peter Boyle	93251bfba0	GET not put for better ordering in the downstream dependent kernels -- I know when I'm done, so we can move a barrier / handshake between ranks intranode to a point off critical path	2025-02-12 14:50:21 +00:00
Peter Boyle	18b79508b8	New line better for pretty print	2025-02-12 14:49:48 +00:00
Peter Boyle	4de5ed1613	Remove vector view. The std::vector will not inform Memory manager of deletion and so a stale entry could be left. It is not and should not be used.	2025-02-12 14:48:46 +00:00
Peter Boyle	0baaddbe98	Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384 nodes. More concurrency/fine grained scheduling is possible.	2025-02-04 19:27:26 +00:00
Peter Boyle	b50fb34e71	Perf on Aurora	2025-02-01 18:39:34 +00:00
Peter Boyle	de84d730ff	Fastest run config on Aurora to date	2025-02-01 18:08:40 +00:00
Peter Boyle	c74d11e3d7	PVdagM MG	2025-02-01 11:04:13 -05:00
Christoph Lehner	84cab5e6e7	no comms and log cleanup	2025-02-01 16:37:21 +01:00
Peter Boyle	c4fc972fec	Merge branch 'feature/deprecate-uvm' into develop	2025-01-31 16:32:36 +00:00
Peter Boyle	8cf809e231	Best results on Aurora so far	2025-01-31 16:14:45 +00:00
Peter Boyle	94019a922e	Significantly better performance on Aurora without using pipeline mode	2025-01-30 16:36:46 +00:00
Peter Boyle	d6b2727f86	Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora	2025-01-29 09:22:21 +00:00
Peter Boyle	74a4f43946	Optional host buffer bounce for no CUDA aware MPI	2025-01-28 15:22:46 +00:00
Peter Boyle	1caf8b0f86	Rename	2025-01-28 15:22:37 +00:00
Chulwoo Jung	570b72a47b	Bugfix. Sorry!	2025-01-21 15:37:39 -05:00
Chulwoo Jung	a5798a89ed	Merge branch 'develop' into specflow	2025-01-21 12:13:24 -05:00
Peter Boyle	3f3661a86f	Heading towards PVdagM multigrid	2025-01-17 14:33:35 +00:00
Chulwoo Jung	f7e2f9a401	Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement	2025-01-16 20:47:33 +00:00
Chulwoo Jung	2848a9b558	DWF Kernel lanczos working(?)	2025-01-16 01:29:56 +00:00
Peter Boyle	8fe429346f	Dslash testing for reproduce	2024-11-11 23:11:11 +00:00
Peter Boyle	5a4f9bf2e3	Force the ROCM version	2024-10-29 18:12:31 -04:00
Peter Boyle	b91fc1b6b4	Merge branch 'feature/boosted' into feature/deprecate-uvm Fixed boosted free field test	2024-10-28 16:53:09 -04:00
Peter Boyle	eafc150034	Test fft asserts	2024-10-23 16:46:26 -04:00
Peter Boyle	2877f1a268	Verbose reduce	2024-10-23 15:14:16 -04:00
Peter Boyle	1e893af775	GPU happy	2024-10-23 14:52:15 -04:00
Peter Boyle	d9f430a575	Happy GPU	2024-10-23 14:51:16 -04:00
Peter Boyle	63abe87f36	Memory manager verbose improvements that were useful to track an error	2024-10-23 14:49:13 -04:00
Peter Boyle	368d649c8a	feature/deprecate-uvm happier -- preallocate device resident neigbour table	2024-10-23 14:47:55 -04:00
Peter Boyle	5603464f39	Fix in partial fraction import/export physical and make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class	2024-10-23 14:45:58 -04:00
Peter Boyle	655c79f39e	Suppress warning on partial override	2024-10-23 14:44:41 -04:00
Peter Boyle	565b231c03	Nvcc happy	2024-10-23 14:44:17 -04:00
Peter Boyle	62a9f180fa	NVCC happy	2024-10-23 14:44:04 -04:00
Peter Boyle	5ae77876a8	Meson field and Aslash field on GPU; some compiler warning removed	2024-10-18 19:08:06 -04:00
Peter Boyle	4ed2c2c74f	Config command	2024-10-18 13:58:33 -04:00
Peter Boyle	955da582b6	Working on NVCC	2024-10-18 13:58:03 -04:00
Peter Boyle	11b07b950d	Vanilla linux compile, assuming spack prerequisites	2024-10-18 13:57:40 -04:00
Peter Boyle	8f70cfeda9	Clean up	2024-10-18 13:56:53 -04:00
Peter Boyle	ce64271048	Remove the copying version	2024-10-18 13:56:24 -04:00
Peter Boyle	5cc4f3241d	Meson field test	2024-10-18 15:42:30 +00:00
Peter Boyle	6815e138b4	Boosted fermion attempt	2024-10-17 18:37:33 +01:00
Peter Boyle	a78a61d76f	Update configure	2024-10-15 14:38:45 +00:00
Peter Boyle	2eff3f34ed	Alternate reduction; default to grids own but make a configure flag --enable-reduction=grid\|mpi	2024-10-15 14:36:06 +00:00
Peter Boyle	03687c1d62	Final version of test, closer to original again	2024-10-15 14:35:17 +00:00
Peter Boyle	febfe4e77f	Make my own reduction a configure flag	2024-10-15 14:32:35 +00:00
Peter Boyle	4d1aa134b5	Use normal reduction, configure flag to force deterministic	2024-10-15 14:32:11 +00:00
Peter Boyle	5ec879860a	Odd rounding issue - bears looking into	2024-10-15 14:30:54 +00:00
Peter Boyle	f617468e04	Update Lattice_base.h	2024-10-11 10:39:16 -04:00
Peter Boyle	b728af903c	Fast axpy norm under CFLAG	2024-10-11 03:23:09 +00:00
Peter Boyle	54f1999030	axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues	2024-10-11 03:22:18 +00:00
Peter Boyle	fd58f0b669	Return ok	2024-10-11 03:21:21 +00:00
Peter Boyle	c5c67b706e	cl::sycl -> SYCL	2024-10-10 22:04:12 +00:00
Peter Boyle	be7a543e2c	Revert barriers -- these were not the problem	2024-10-10 22:03:29 +00:00
Peter Boyle	68f112d576	New software moves cl::sycl	2024-10-10 22:03:04 +00:00
Peter Boyle	ec1395a304	Better flight logging	2024-10-10 22:01:57 +00:00
Peter Boyle	beb0e474ee	Use deterministic own brand reduction	2024-10-10 22:01:24 +00:00
Peter Boyle	2b5fdcbbc5	New software version	2024-10-10 21:59:02 +00:00
Peter Boyle	295127d456	Deterministic homebrew reduction	2024-10-10 21:58:26 +00:00
Peter Boyle	7dcfb13694	New software stack	2024-10-10 21:57:35 +00:00
Peter Boyle	ee4046fe92	Added a dimension ordered column sum based reduction for scalar. Removes dependence on MPI_Allreduce and allows for work around on systems where this is bollox.	2024-09-27 09:26:03 -04:00
Peter Boyle	2a9cfeb9ea	New files	2024-09-26 14:23:29 -04:00
Peter Boyle	1147b8ea40	Cheby poly setup	2024-09-26 14:20:32 -04:00
Peter Boyle	3f9119b39d	Remove vectors used for the power spectrum table in paper	2024-09-26 14:19:41 -04:00
Peter Boyle	35e8225abd	Verbose control	2024-09-26 14:18:35 -04:00
Peter Boyle	bdbfbb7a14	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-09-26 14:05:45 -04:00
Peter Boyle	f7d4be8d96	Calculate bytes correctly	2024-09-26 14:04:44 -04:00
Peter Boyle	9fa8bd6438	Configure for AOT on Aurora latest software	2024-09-23 11:25:44 +00:00
Peter Boyle	02c8178f16	Almost working on Aurora	2024-09-23 09:43:50 +00:00
Peter Boyle	e637fbacae	Verbose remove	2024-09-23 09:42:43 +00:00
Peter Boyle	e29b97b3ea	Qslash term added	2023-09-14 16:14:03 -04:00
Peter Boyle	ad2b699d2b	Better macos	2023-09-14 16:12:21 -04:00