Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-13 04:37:05 +01:00)

Compare commits: 108 commits, feature/de ... 21de6f7da8
Commits (SHA1, newest first; the author and date columns carried no data):

21de6f7da8  dbe39f9ce0  ab3de50d5e  c545bd2139  6a1c64fbdd  b75809ed61
ecaf228e5c  6d015ae8fc  233150d93f  7af8c77a52  a957e7bfa1  cee4c8ce8c
96bf814d8c  7ddc422788  e652fc2825  a49fa3f8d0  cd452a2f91  4f89f603ae
11dc2c5e1d  6fec3c15ca  938c47480f  3811d19298  83a3ab6b6f  d66a9af6a3
adc90d3a86  ebbd015c5c  4ab73b36b2  130e07a422  8f47bb367e  0c3cb60135
9eae8fca5d  882a217074  e465fce201  d41542c64b  199818bd6c  fe66c7ca30
e9177e4af3  d15a6c5933  25ab9325e7  19f9378b98  785bc7a14f  1a1fe85428
0000d2e558  9ffd1ed4ce  3d014864e2  1d22841811  a1cdda833f  ad6db92690
e8ff9d8e50  795769c636  267a39d943  3624bd3d22  bc12dbbb38  eb8a008a8f
c4d9aa1a21  6ae809ed40  311e2aab3f  438dfbdb83  b2ce760cf4  b1ba209696
cb3e529b1e  717f647418  98e7418187  fe05bf48b1  d2dd8f54e2  7726ee4b16
ba9bbe0221  4c3dd82d84  44e911b5b7  a7a16df9d0  382e0abefd  6fdefe5b90
4788dd8e2e  1cc5f221f3  93251bfba0  18b79508b8  4de5ed1613  0baaddbe98
8729c46169  09f81fe7c3  1876e5b7c0  355ec76257  b50fb34e71  de84d730ff
c74d11e3d7  84cab5e6e7  c4fc972fec  4f17c8d081  aaab753982  570b72a47b
a5798a89ed  3f3661a86f  f7e2f9a401  2848a9b558  d4868991af  e99d42404e
3ba019c747  47429218bb  5a4f9bf2e3  f617468e04  ee4046fe92  2a9cfeb9ea
1147b8ea40  3f9119b39d  35e8225abd  bdbfbb7a14  f7d4be8d96  8d305df0db
@@ -191,7 +191,7 @@ public:
     Lattice<sobj> pgbuf(&pencil_g);
     autoView(pgbuf_v , pgbuf, CpuWrite);
-    std::cout << "CPU view" << std::endl;
+    //std::cout << "CPU view" << std::endl;

     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
     typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;

@@ -215,7 +215,7 @@ public:
     else if ( sign == forward ) div = 1.0;
     else assert(0);

-    std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
     FFTW_plan p;
     {
       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];

@@ -229,7 +229,7 @@ public:
     }

     // Barrel shift and collect global pencil
-    std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
     Coordinate lcoor(Nd), gcoor(Nd);
     result = source;
     int pc = processor_coor[dim];

@@ -251,7 +251,7 @@ public:
       }
     }

-    std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
+    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
     // Loop over orthog coords
     int NN=pencil_g.lSites();
     GridStopWatch timer;

@@ -274,7 +274,7 @@ public:
     usec += timer.useconds();
     flops+= flops_call*NN;

-    std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
+    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
     // writing out result
     {
       autoView(pgbuf_v,pgbuf,CpuRead);

@@ -291,7 +291,7 @@ public:
     }
     result = result*div;

-    std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
+    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
     // destroying plan
     FFTW<scalar>::fftw_destroy_plan(p);
 #endif
@@ -277,6 +277,38 @@ public:
     assert(0);
   }
 };
+template<class Matrix,class Field>
+class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+  RealD shift;
+public:
+  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    _Mat.Mdiag(in,out);
+    out = out + shift*in;
+  }
+  void OpDir (const Field &in, Field &out,int dir,int disp) {
+    _Mat.Mdir(in,out,dir,disp);
+  }
+  void OpDirAll (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
+  void Op (const Field &in, Field &out){
+    _Mat.M(in,out);
+    out = out + shift * in;
+  }
+  void AdjOp (const Field &in, Field &out){
+    _Mat.Mdag(in,out);
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    assert(0);
+  }
+  void HermOp(const Field &in, Field &out){
+    assert(0);
+  }
+};

 //////////////////////////////////////////////////////////
 // Even Odd Schur decomp operators; there are several
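The added ShiftedNonHermitianLinearOperator wraps a matrix M so that Op and AdjOp apply M + shift and M^dag + shift. A minimal usage sketch (my illustration, not part of the commit; MyMatrix, Field, the grid, the shift value and the solver settings are placeholders, while TrivialPrecon and the six-argument GCR constructor mirror their use elsewhere in this diff):

    // Hedged sketch: solve (M + sigma) psi = src with the wrapped operator.
    // Only ShiftedNonHermitianLinearOperator itself comes from this diff.
    RealD sigma = 0.1;                                  // illustrative shift
    ShiftedNonHermitianLinearOperator<MyMatrix,Field> ShiftedOp(M,sigma);

    TrivialPrecon<Field> simple;                        // identity preconditioner
    PrecGeneralisedConjugateResidualNonHermitian<Field> GCR(1.0e-8,1000,ShiftedOp,simple,16,16);
    Field psi(grid); psi = Zero();
    GCR(src,psi);                                       // psi ~= (M + sigma)^{-1} src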
@@ -208,8 +208,8 @@ public:
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);

-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);

     int lda = m; // m x k column major
     int ldb = k; // k x n column major

@@ -367,28 +367,67 @@ public:
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn ;
     });
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
+    });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk * eBkn.adjoint() ;
     });
+  } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+    });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
     } );
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+    } );
   } else {
     assert(0);
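The `if (std::abs(beta) != 0.0)` guards introduced throughout these GEMM branches follow the usual BLAS convention that C is write-only when beta is zero. A standalone toy (Eigen only, not code from this commit) showing why the unguarded form misbehaves on uninitialised output:

    #include <Eigen/Dense>
    #include <iostream>
    #include <limits>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(2,2);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(2,2);
      Eigen::MatrixXd C(2,2);
      C.setConstant(std::numeric_limits<double>::quiet_NaN()); // "uninitialised" output
      double alpha = 1.0, beta = 0.0;

      Eigen::MatrixXd naive = beta * C + alpha * A * B;   // 0.0*NaN = NaN: result is poisoned
      Eigen::MatrixXd guarded(2,2);
      if (beta != 0.0) guarded = beta * C + alpha * A * B;
      else             guarded = alpha * A * B;            // finite, the intended semantics

      std::cout << "naive(0,0)=" << naive(0,0)
                << "  guarded(0,0)=" << guarded(0,0) << std::endl;
      return 0;
    }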
@@ -414,8 +453,8 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();

-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);

     int lda = m; // m x k column major
     int ldb = k; // k x n column major

@@ -514,28 +553,70 @@ public:
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn ;
     });
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
+    });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk * eBkn.adjoint() ;
     });
+  } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
+    });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
     } );
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+    } );
   } else {
     assert(0);

@@ -661,29 +742,41 @@ public:
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-    } );
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+    });
   } else {
     assert(0);
   }

@@ -809,28 +902,40 @@ public:
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
     });
   } else {
     assert(0);
@@ -144,11 +144,11 @@ public:
       acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
     }
     RealD t4 = usecond();
-    std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
-    std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
-    std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
-    std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
-    std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
   }

   void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)

@@ -242,16 +242,16 @@ public:
     RealD flops = 8.0*M*N*K;
     flops = flops/(t4-t3)/1.e3;
     bytes = bytes/(t4-t3)/1.e3;
-    std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
-    std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
-    std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
 #else
     int nrhs;
     GridBase *grid;

@@ -358,17 +358,17 @@ public:
     flops = flops/(t4-t3)/1.e3;
     bytes = bytes/(t4-t3)/1.e3;
     xybytes = 4*xybytes/(t2-t1)/1.e3;
-    std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
-    std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
-    std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
 #endif
   }
 };
@@ -63,7 +63,12 @@ class TwoLevelCGmrhs
   GridStopWatch SmoothTimer;
   GridStopWatch InsertTimer;

+  /*
+  Field rrr;
+  Field sss;
+  Field qqq;
+  Field zzz;
+  */
   // more most opertor functions
   TwoLevelCGmrhs(RealD tol,
                  Integer maxit,

@@ -74,6 +79,12 @@ class TwoLevelCGmrhs
     MaxIterations(maxit),
     _FineLinop(FineLinop),
     _Smoother(Smoother)
+    /*
+    rrr(fine),
+    sss(fine),
+    qqq(fine),
+    zzz(fine)
+    */
   {
     grid = fine;
   };

@@ -81,8 +92,8 @@ class TwoLevelCGmrhs
   // Vector case
   virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
   {
-    SolveSingleSystem(src,x);
-    //    SolvePrecBlockCG(src,x);
+    //    SolveSingleSystem(src,x);
+    SolvePrecBlockCG(src,x);
   }

   ////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -657,6 +668,8 @@ public:
   CoarseField PleftProjMrhs(this->coarsegridmrhs);
   CoarseField PleftMss_projMrhs(this->coarsegridmrhs);

+  //    this->rrr=in[0];
+
 #undef SMOOTHER_BLOCK_SOLVE
 #if SMOOTHER_BLOCK_SOLVE
   this->SmoothTimer.Start();

@@ -669,6 +682,7 @@ public:
     this->SmoothTimer.Stop();
   }
 #endif
+  //  this->sss=Min[0];

   for(int rhs=0;rhs<nrhs;rhs++) {

@@ -705,9 +719,11 @@ public:
   this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
   this->PromoteTimer.Stop();
   this->FineTimer.Start();
+  //  this->qqq=tmp[0];
   for(int rhs=0;rhs<nrhs;rhs++) {
     axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
   }
+  //  this->zzz=out[0];
   this->FineTimer.Stop();
 }
 };
@@ -245,9 +245,10 @@ until convergence
       _HermOp(src_n,tmp);
       //      std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
       //      std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
-      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+      //      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+      RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
       RealD vden = norm2(src_n);
-      RealD na = vnum/vden;
+      RealD na = std::sqrt(vnum/vden);
       if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
         i=_MAX_ITER_IRL_MEVAPP_;
       evalMaxApprox = na;

@@ -255,6 +256,7 @@ until convergence
       src_n = tmp;
     }
   }
+  std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;

   std::vector<RealD> lme(Nm);
   std::vector<RealD> lme2(Nm);
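One annotation on the estimator change in this hunk (mine, not from the commit): for Hermitian H and power iterate x_n, the replaced and replacement estimates are

    \[
      na_{\text{old}} = \frac{\langle x_n , H x_n \rangle}{\langle x_n , x_n \rangle},
      \qquad
      na_{\text{new}} = \sqrt{\frac{\langle H x_n , H x_n \rangle}{\langle x_n , x_n \rangle}}
                      = \frac{\lVert H x_n \rVert}{\lVert x_n \rVert}.
    \]

Both tend to the extreme eigenvalue as x_n aligns with the dominant eigenvector; the new form is the square root of the Rayleigh quotient of H^2, matching the "HermOp^2" comment, and is nonnegative by construction.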
@@ -74,7 +74,7 @@ public:

   void operator() (const Field &src, Field &psi){

-    psi=Zero();
+    //    psi=Zero();
     RealD cp, ssq,rsq;
     ssq=norm2(src);
     rsq=Tolerance*Tolerance*ssq;
@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #pragma once

+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+
 NAMESPACE_BEGIN(Grid);

 inline RealD AggregatePowerLaw(RealD x)

@@ -95,7 +97,7 @@ public:

   RealD scale;

-  ConjugateGradient<FineField> CG(1.0e-2,100,false);
+  ConjugateGradient<FineField> CG(1.0e-3,400,false);
   FineField noise(FineGrid);
   FineField Mn(FineGrid);

@@ -108,7 +110,7 @@ public:

   hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;

-  for(int i=0;i<1;i++){
+  for(int i=0;i<4;i++){

     CG(hermop,noise,subspace[b]);

@@ -124,6 +126,53 @@ public:
   }
 }

+virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
+{
+  RealD scale;
+
+  TrivialPrecon<FineField> simple_fine;
+  PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
+  FineField noise(FineGrid);
+  FineField src(FineGrid);
+  FineField guess(FineGrid);
+  FineField Mn(FineGrid);
+
+  for(int b=0;b<nn;b++){
+
+    subspace[b] = Zero();
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5);
+    noise=noise*scale;
+
+    DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
+
+    for(int i=0;i<2;i++){
+      //  void operator() (const Field &src, Field &psi){
+#if 1
+      std::cout << GridLogMessage << " inverting on noise "<<std::endl;
+      src = noise;
+      guess=Zero();
+      GCR(src,guess);
+      subspace[b] = guess;
+#else
+      std::cout << GridLogMessage << " inverting on zero "<<std::endl;
+      src=Zero();
+      guess = noise;
+      GCR(src,guess);
+      subspace[b] = guess;
+#endif
+      noise = subspace[b];
+      scale = std::pow(norm2(noise),-0.5);
+      noise=noise*scale;
+
+    }
+
+    DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
+    subspace[b] = noise;
+
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////
 // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
 // and this is the best I found

@@ -160,14 +209,21 @@ public:

   int b =0;
   {
+    ComplexD ip;
     // Filter
     Chebyshev<FineField> Cheb(lo,hi,orderfilter);
     Cheb(hermop,noise,Mn);
     // normalise
     scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
     subspace[b] = Mn;
-    hermop.Op(Mn,tmp);
-    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+    hermop.Op(Mn,tmp);
+    ip= innerProduct(Mn,tmp);
+    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
+    hermop.AdjOp(Mn,tmp);
+    ip = innerProduct(Mn,tmp);
+    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
     b++;
   }

@@ -213,8 +269,18 @@ public:
     Mn=*Tnp;
     scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
     subspace[b] = Mn;
-    hermop.Op(Mn,tmp);
-    std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+    ComplexD ip;
+
+    hermop.Op(Mn,tmp);
+    ip= innerProduct(Mn,tmp);
+    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
+    hermop.AdjOp(Mn,tmp);
+    ip = innerProduct(Mn,tmp);
+    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
     b++;
   }

@@ -228,6 +294,70 @@ public:
   }
   assert(b==nn);
 }

+virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
+                                     int nn,
+                                     double hi,
+                                     double lo1,
+                                     int orderfilter,
+                                     double lo2,
+                                     int orderstep)
+{
+  RealD scale;
+
+  FineField noise(FineGrid);
+  FineField Mn(FineGrid);
+  FineField tmp(FineGrid);
+
+  // New normalised noise
+  gaussian(RNG,noise);
+  scale = std::pow(norm2(noise),-0.5);
+  noise=noise*scale;
+
+  std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
+  // Initial matrix element
+  hermop.Op(noise,Mn);
+  std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+  int b =0;
+  {
+    // Filter
+    std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
+    Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
+    Cheb(hermop,noise,Mn);
+    // normalise
+    scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
+    subspace[b] = Mn;
+    hermop.Op(Mn,tmp);
+    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
+  }
+
+  // Generate a full sequence of Chebyshevs
+  for(int n=1;n<nn;n++){
+    std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
+    Chebyshev<FineField> Cheb(lo2,hi,orderstep);
+    Cheb(hermop,subspace[n-1],Mn);
+
+    for(int m=0;m<n;m++){
+      ComplexD c = innerProduct(subspace[m],Mn);
+      Mn = Mn - c*subspace[m];
+    }
+
+    // normalise
+    scale = std::pow(norm2(Mn),-0.5);
+    Mn=Mn*scale;
+
+    subspace[n]=Mn;
+
+    hermop.Op(Mn,tmp);
+    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
+
+  }
+}
+
 virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                      int nn,
                                      double hi,
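The inner loop of the new CreateSubspacePolyCheby orthogonalises each Chebyshev-filtered vector against the basis built so far. Written out (my annotation, not text from the commit), with u_m = subspace[m] already normalised, the update is classical Gram-Schmidt followed by renormalisation:

    \[
      M_n \;\leftarrow\; M_n - \sum_{m<n} \langle u_m , M_n \rangle \, u_m ,
      \qquad
      M_n \;\leftarrow\; \frac{M_n}{\lVert M_n \rVert},
    \]

so the generated subspace vectors are orthonormal by construction before the coarse operator is assembled.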
@@ -441,8 +441,20 @@ public:
     std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
   }
 #else
   //////////////////////////////////////////////////////////////////////
   // Galerkin projection of matrix
   //////////////////////////////////////////////////////////////////////
   void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
                        Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    CoarsenOperator(linop,Subspace,Subspace);
+  }
+  //////////////////////////////////////////////////////////////////////
+  // Petrov - Galerkin projection of matrix
+  //////////////////////////////////////////////////////////////////////
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+                       Aggregation<Fobj,CComplex,nbasis> & U,
+                       Aggregation<Fobj,CComplex,nbasis> & V)
   {
     std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
     GridBase *grid = FineGrid();

@@ -458,11 +470,9 @@ public:
     // Orthogonalise the subblocks over the basis
     /////////////////////////////////////////////////////////////
     CoarseScalar InnerProd(CoarseGrid());
-    blockOrthogonalise(InnerProd,Subspace.subspace);
+    blockOrthogonalise(InnerProd,V.subspace);
+    blockOrthogonalise(InnerProd,U.subspace);

-    //    for(int s=0;s<Subspace.subspace.size();s++){
-    //      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
-    //    }
     const int npoint = geom.npoint;

     Coordinate clatt = CoarseGrid()->GlobalDimensions();

@@ -542,7 +552,7 @@ public:
     std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
     for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
       tphaseBZ-=usecond();
-      phaV = phaF[p]*Subspace.subspace[i];
+      phaV = phaF[p]*V.subspace[i];
       tphaseBZ+=usecond();

       /////////////////////////////////////////////////////////////////////

@@ -555,7 +565,7 @@ public:
       // std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;

       tproj-=usecond();
-      blockProject(coarseInner,MphaV,Subspace.subspace);
+      blockProject(coarseInner,MphaV,U.subspace);
       coarseInner = conjugate(pha[p]) * coarseInner;

       ComputeProj[p] = coarseInner;
@@ -69,7 +69,7 @@ public:
   }

   // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { assert(0);};
+  void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
 };

@@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
 template<class T> using hostVector   = std::vector<T,alignedAllocator<T> >; // Needs autoview
-template<class T> using Vector       = std::vector<T,uvmAllocator<T> >;     //
+template<class T> using Vector       = std::vector<T,uvmAllocator<T> >;     // Really want to deprecate
+template<class T> using uvmVector    = std::vector<T,uvmAllocator<T> >;     // auto migrating page
 template<class T> using deviceVector = std::vector<T,devAllocator<T> >;     // device vector

 /*
 template<class T> class vecView
 {
  protected:

@@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
 #define autoVecView(v_v,v,mode) \
   auto v_v = VectorView(v,mode); \
   ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
 */

 NAMESPACE_END(Grid);
@@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ];
 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
 #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
 //#define dprintf(...)
+//#define mprintf(...)

 ////////////////////////////////////////////////////////////
 // For caching copies of data on device

@@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);

-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
+  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
   assert(AccCache.accLock==0);
   assert(AccCache.cpuLock==0);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);

@@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
     DeviceBytes -=AccCache.bytes;
     LRUremove(AccCache);
     AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
   }
   uint64_t CpuPtr = AccCache.CpuPtr;
   EntryErase(CpuPtr);

@@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);

-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
           (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
           (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
   if (AccCache.accLock!=0) return;

@@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)NULL;
     AccCache.state=CpuDirty; // CPU primary now
     DeviceBytes -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
   }
   //  uint64_t CpuPtr = AccCache.CpuPtr;
   DeviceEvictions++;

@@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
   assert(AccCache.AccPtr!=(uint64_t)NULL);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   DeviceToHostBytes+=AccCache.bytes;
   DeviceToHostXfer++;
   AccCache.state=Consistent;

@@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
   }
-  mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
+  mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
           (uint64_t)AccCache.bytes,
           (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);

@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
     AcceleratorViewClose((uint64_t)Ptr);
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     CpuViewClose((uint64_t)Ptr);

@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
   uint64_t CpuPtr = (uint64_t)_CpuPtr;
   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
     return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);

@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void MemoryManager::EvictVictims(uint64_t bytes)
 {
+  if(bytes>=DeviceMaxBytes) {
+    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
+  }
   assert(bytes<DeviceMaxBytes);
   while(bytes+DeviceLRUBytes > DeviceMaxBytes){
     if ( DeviceLRUBytes > 0){

@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   assert(AccCache.cpuLock==0);  // Programming error

   if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
+    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
            (uint64_t)AccCache.CpuPtr,
            (uint64_t)CpuPtr,
            (uint64_t)AccCache.bytes,

@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state = Consistent; // Empty + AccRead => Consistent
     }
     AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
   } else if(AccCache.state==CpuDirty ){
     if(mode==AcceleratorWriteDiscard) {
       CpuDiscard(AccCache);

@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
     }
     AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
   } else if(AccCache.state==Consistent) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
       AccCache.state = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
     else
       AccCache.state = Consistent; // Consistent + AccRead => Consistent
     AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
   } else if(AccCache.state==AccDirty) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
     else
      AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
     AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
   } else {
     assert(0);
   }

@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   // If view is opened on device must remove from LRU
   if(AccCache.LRU_valid==1){
     // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU \n");
+    dprintf("AccCache entry removed from LRU ");
     LRUremove(AccCache);
   }

@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
   AccCache.accLock--;
   // Move to LRU queue if not locked and close on device
   if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
     LRUinsert(AccCache);
   } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
   }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>

+#define NVLINK_GET
+
 NAMESPACE_BEGIN(Grid);

 extern bool Stencil_force_mpi ;

@@ -127,7 +129,7 @@ public:
   void GlobalSumVector(ComplexD *c,int N);
   void GlobalXOR(uint32_t &);
   void GlobalXOR(uint64_t &);

   template<class obj> void GlobalSumP2P(obj &o)
   {
     std::vector<obj> column;

@@ -147,7 +149,8 @@ public:
                        sizeof(obj),d*100+p);

     }
-    CommsComplete(list);
+    if (!list.empty()) // avoid triggering assert in comms == none
+      CommsComplete(list);
     for(int p=1;p<_processors[d];p++){
       accum = accum + column[p];
     }

@@ -192,6 +195,11 @@ public:
                                    void *recv,
                                    int recv_from_rank,int do_recv,
                                    int xbytes,int rbytes,int dir);

+  // Could do a PollHtoD and have a CommsMerge dependence
+  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
+  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
+
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                     void *xmit,
                                     int xmit_to_rank,int do_xmit,
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+
 Grid_MPI_Comm CartesianCommunicator::communicator_world;

 ////////////////////////////////////////////

@@ -362,8 +363,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int bytes)
 {
   std::vector<MpiCommsRequest_t> reqs(0);
-  unsigned long xcrc = crc32(0L, Z_NULL, 0);
-  unsigned long rcrc = crc32(0L, Z_NULL, 0);

   int myrank = _processor;
   int ierr;

@@ -379,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                     communicator,MPI_STATUS_IGNORE);
   assert(ierr==0);

-  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
-  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,

@@ -399,6 +395,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,


 #ifdef ACCELERATOR_AWARE_MPI
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
                                                            void *xmit,
                                                            int dest,int dox,

@@ -440,8 +438,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(rrq);
       off_node_bytes+=rbytes;
     }
+#ifdef NVLINK_GET
+    else {
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+    }
+#endif
   }

   // This is a NVLINK PUT
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;

@@ -450,9 +455,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(xrq);
       off_node_bytes+=xbytes;
     } else {
+#ifndef NVLINK_GET
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
       assert(shm!=NULL);
       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
     }
   }
   return off_node_bytes;

@@ -461,7 +468,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   int nreq=list.size();

   /*finishes Get/Put*/
   acceleratorCopySynchronise();

   if (nreq==0) return;
@@ -561,53 +568,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ

   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+#undef DEVICE_TO_HOST_CONCURRENT // pipeline
+#ifdef DEVICE_TO_HOST_CONCURRENT

       tag= dir+_processor*32;

       host_xmit = this->HostBufferMalloc(xbytes);
-      acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch

       //    ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       //    assert(ierr==0);
       //    off_node_bytes+=xbytes;

-      CommsRequest_t srq;
       srq.PacketType = InterNodeXmit;
       srq.bytes      = xbytes;
       //      srq.req        = xrq;
       srq.host_buf   = host_xmit;
       srq.device_buf = xmit;
+      srq.tag        = tag;
+      srq.dest       = dest;
+      srq.commdir    = commdir;
       list.push_back(srq);
+#else
+      tag= dir+_processor*32;
+
+      host_xmit = this->HostBufferMalloc(xbytes);
+      const int chunks=1;
+      for(int n=0;n<chunks;n++){
+        void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
+        void * xmitc      = (void *)( (uint64_t) xmit      + n*xbytes/chunks);
+        acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
+      }
+      acceleratorCopySynchronise(); // Complete all pending copy transfers
+
+      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      off_node_bytes+=xbytes;
+
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      list.push_back(srq);
+#endif
     }
   }

   return off_node_bytes;
 }
+/*
+ * In the interest of better pipelining, poll for completion on each DtoH and
+ * start MPI_ISend in the meantime
+ */
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
+{
+  int pending = 0;
+  do {
+
+    pending = 0;
+
+    for(int idx = 0; idx<list.size();idx++){
+
+      if ( list[idx].PacketType==InterNodeRecv ) {
+
+        int flag = 0;
+        MPI_Status status;
+        int ierr = MPI_Test(&list[idx].req,&flag,&status);
+        assert(ierr==0);
+
+        if ( flag ) {
+          //      std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
+          acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
+          list[idx].PacketType=InterNodeReceiveHtoD;
+        } else {
+          pending ++;
+        }
+      }
+    }
+    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
+  } while ( pending );
+
+}
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
+{
+  int pending = 0;
+  do {
+
+    pending = 0;
+
+    for(int idx = 0; idx<list.size();idx++){
+
+      if ( list[idx].PacketType==InterNodeXmit ) {
+
+        if ( acceleratorEventIsComplete(list[idx].ev) ) {
+
+          void *host_xmit = list[idx].host_buf;
+          uint32_t xbytes = list[idx].bytes;
+          int dest        = list[idx].dest;
+          int tag         = list[idx].tag;
+          int commdir     = list[idx].commdir;
+          ///////////////////
+          // Send packet
+          ///////////////////
+
+          //      std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
+
+          MPI_Request xrq;
+          int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+          assert(ierr==0);
+
+          list[idx].req = xrq; // Update the MPI request in the list
+
+          list[idx].PacketType=InterNodeXmitISend;
+
+        } else {
+          // not done, so return to polling loop
+          pending++;
+        }
+      }
+    }
+  } while (pending);
+}

 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,

@@ -644,69 +703,89 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  * - complete all copies
  * - post MPI send asynch
  */
+#ifdef NVLINK_GET
+  if ( dor ) {
+
+    //    static int printed;
+    //    if((printed<8) && this->IsBoss() ) {
+    //      printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
+    //      printed++;
+    //    }
+
+    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+      assert(shm!=NULL);
+
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+
+      srq.PacketType = IntraNodeRecv;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
+    }
+  }
+#else
   if (dox) {

     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+#ifdef DEVICE_TO_HOST_CONCURRENT
       tag= dir+_processor*32;
+      // Find the send in the prepared list
+      int list_idx=-1;
+      for(int idx = 0; idx<list.size();idx++){
+
+        if ( (list[idx].device_buf==xmit)
+           &&(list[idx].PacketType==InterNodeXmit)
+           &&(list[idx].bytes==xbytes) ) {
+
+          list_idx = idx;
+          host_xmit = list[idx].host_buf;
+        }
+      }
+      assert(list_idx != -1); // found it
+      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      list[list_idx].req = xrq; // Update the MPI request in the list
+      off_node_bytes+=xbytes;
+#endif
-    } else {
+    } else {
+      if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
       // Intranode
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
       assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+
+      srq.PacketType = IntraNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
+
+      }
     }
   }
+#endif
   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   int nreq=list.size();
   acceleratorCopySynchronise(); // Complete all pending copy transfers D2D

-  if (nreq==0) return;
-  std::vector<MPI_Status> status(nreq);
-  std::vector<MPI_Request> MpiRequests(nreq);
+  std::vector<MPI_Status> status;
+  std::vector<MPI_Request> MpiRequests;

+  for(int r=0;r<list.size();r++){
+    // Must check each Send buf is clear to reuse
+    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
+    // if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
+  }

-  for(int r=0;r<nreq;r++){
-    MpiRequests[r] = list[r].req;
+  int nreq=MpiRequests.size();

+  if (nreq>0) {
+    status.resize(MpiRequests.size());
+    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+    assert(ierr==0);
+  }

-  int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
-  assert(ierr==0);

-  for(int r=0;r<nreq;r++){
-    if ( list[r].PacketType==InterNodeRecv ) {
-      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
-    }
-  }
+  //  for(int r=0;r<nreq;r++){
+  //    if ( list[r].PacketType==InterNodeRecv ) {
+  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
+  //    }
+  //  }

   acceleratorCopySynchronise(); // Complete all pending copy transfers
   list.resize(0);               // Delete the list
   this->HostBufferFreeAll();    // Clean up the buffer allocs
-  this->StencilBarrier();
+#ifndef NVLINK_GET
+  this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
+#endif
 }
 #endif
 ////////////////////////////////////////////
@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                 void *xmit,
                                                 int dest,

@@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
   return 2.0*bytes;
 }
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
                                                            void *xmit,
                                                            int xmit_to_rank,int dox,

@@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
                                                            int recv_from_rank,int dor,
                                                            int xbytes,int rbytes, int dir)
 {
-  return xbytes+rbytes;
+  return 0.0;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
/*
 * Enable state transitions as each packet flows.
 */
enum PacketType_t {
  FaceGather,
  InterNodeXmit,
  InterNodeRecv,
  IntraNodeXmit,
  IntraNodeRecv,
  InterNodeXmitISend,
  InterNodeReceiveHtoD
};
/*
 * Package arguments needed for various actions along packet flow
 */
typedef struct {
  PacketType_t PacketType;
  void *host_buf;
  void *device_buf;
  int dest;
  int tag;
  int commdir;
  unsigned long bytes;
  acceleratorEvent_t ev;
  MpiCommsRequest_t req;
} CommsRequest_t;
#endif
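
With accelerator-unaware MPI every halo packet is tracked explicitly, and PacketType records where it sits in the gather / device-to-host / MPI / host-to-device pipeline. A rough send-side sketch of how one request walks these states (illustrative only; stage_to_host and post_isend are hypothetical stand-ins for the acceleratorCopy*Asynch and MPI_Isend calls in the real code):

  // Hypothetical helpers standing in for acceleratorCopyFromDeviceAsynch / MPI_Isend.
  acceleratorEvent_t stage_to_host(void *device_buf, void *host_buf, unsigned long bytes);
  void post_isend(void *host_buf, unsigned long bytes, int dest, int tag,
                  int commdir, MpiCommsRequest_t *req);

  void send_one_packet(CommsRequest_t &rq) {
    rq.PacketType = FaceGather;            // face gathered into rq.device_buf
    rq.ev = stage_to_host(rq.device_buf, rq.host_buf, rq.bytes); // async D2H stage
    rq.PacketType = InterNodeXmit;         // D2H copy in flight
    // ...once rq.ev completes (polled by StencilSendToRecvFromPollDtoH)...
    post_isend(rq.host_buf, rq.bytes, rq.dest, rq.tag, rq.commdir, &rq.req);
    rq.PacketType = InterNodeXmitISend;    // MPI_Isend posted; reaped by MPI_Waitall
  }
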
@ -119,7 +137,7 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  // static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  static void SharedMemoryZero(void *dest,size_t bytes);

};
@ -542,12 +542,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
  printf("Host buffer allocate for GPU non-aware MPI\n");
  // printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
  HostCommBuf= acceleratorAllocHost(bytes);
#else
  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#if 0
#warning "Moving host buffers to specific NUMA domain"
  int numa;
  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
  bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
  acceleratorCopyToDevice(src,dest,bytes);
#else
  bcopy(src,dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
//  acceleratorCopyToDevice(src,dest,bytes);
//#else
//  bcopy(src,dest,bytes);
//#endif
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);

    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
    // std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
  }
  ShmBufferFreeAll();

@ -989,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
#endif

  //SharedMemoryTest();
  SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@ -1011,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
      check[0]=GlobalSharedMemory::WorldNode;
      check[1]=r;
      check[2]=magic;
      GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
      acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
    }
  }
  ShmBarrier();
  for(uint64_t r=0;r<ShmSize;r++){
    ShmBarrier();
    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
    ShmBarrier();
    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==magic);
    ShmBarrier();
  }
  ShmBarrier();
  std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}

void *SharedMemory::ShmBuffer(int rank)
@ -68,7 +68,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
}
#if 1

template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
  int sshift[2];
@ -125,7 +125,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);

#ifndef ACCELERATOR_AWARE_MPI
  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif

  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  RealD tcopy=0.0;
@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      // int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;

      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

      tcomms-=usecond();
      grid->Barrier();

#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFrom((void *)&send_buf[0],
			   xmit_to_rank,
			   (void *)&recv_buf[0],
			   recv_from_rank,
			   bytes);
#else
      // bouncy bouncy
      acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
      grid->SendToRecvFrom((void *)&hsend_buf[0],
			   xmit_to_rank,
			   (void *)&hrecv_buf[0],
			   recv_from_rank,
			   bytes);
      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif

      xbytes+=bytes;
      grid->Barrier();
      tcomms+=usecond();
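
When MPI cannot address device memory, the shift falls back to the "bouncy bouncy" path above: stage the gathered plane to the host, exchange host buffers, and stage the reply back to the device. A minimal sketch of that pattern, with copyD2H/copyH2D as hypothetical stand-ins for acceleratorCopyFromDevice/acceleratorCopyToDevice:

  #include <mpi.h>

  void copyD2H(void *host, const void *dev, size_t bytes);  // hypothetical
  void copyH2D(void *dev, const void *host, size_t bytes);  // hypothetical

  template<class T>
  void staged_sendrecv(T *d_send, T *d_recv, int bytes,
		       int to, int from, MPI_Comm comm,
		       T *h_send, T *h_recv) {
    copyD2H(h_send, d_send, bytes);                  // device -> host staging buffer
    MPI_Sendrecv(h_send, bytes, MPI_CHAR, to,   0,   // host-to-host MPI exchange
		 h_recv, bytes, MPI_CHAR, from, 0,
		 comm, MPI_STATUS_IGNORE);
    copyH2D(d_recv, h_recv, bytes);                  // host -> device
  }
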
@ -226,12 +243,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;


  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }

#ifndef ACCELERATOR_AWARE_MPI
  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif

  int bytes = buffer_size*sizeof(scalar_object);

  ExtractPointerArray<scalar_object>  pointers(Nsimd); //
@ -283,11 +304,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo

      send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
      recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFrom((void *)send_buf_extract_mpi,
			   xmit_to_rank,
			   (void *)recv_buf_extract_mpi,
			   recv_from_rank,
			   bytes);
#else
      // bouncy bouncy
      acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
      grid->SendToRecvFrom((void *)&hsend_buf[0],
			   xmit_to_rank,
			   (void *)&hrecv_buf[0],
			   recv_from_rank,
			   bytes);
      acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif

      xbytes+=bytes;
      grid->Barrier();
@ -311,234 +343,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  }
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

  GridBase *grid=rhs.Grid();
  Lattice<vobj> temp(rhs.Grid());

  int fd              = rhs.Grid()->_fdimensions[dimension];
  int rd              = rhs.Grid()->_rdimensions[dimension];
  int pd              = rhs.Grid()->_processors[dimension];
  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  assert(simd_layout==1);
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;

  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
  vobj *send_buf;
  vobj *recv_buf;
  {
    grid->ShmBufferFreeAll();
    size_t bytes = buffer_size*sizeof(vobj);
    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
  }

  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

  for(int x=0;x<rd;x++){

    int sx        = (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd;

    if (comm_proc==0) {

      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask);
      tcopy+=usecond();

    } else {

      int words = buffer_size;
      if (cbmask != 0x3) words=words>>1;

      int bytes = words * sizeof(vobj);

      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
      tgather+=usecond();

      // int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

      tcomms-=usecond();
      // grid->Barrier();

      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
			   xmit_to_rank,
			   (void *)&recv_buf[0],
			   recv_from_rank,
			   bytes);
      xbytes+=bytes;
      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);

      // grid->Barrier();
      tcomms+=usecond();

      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
      tscatter+=usecond();
    }
  }
  if(Cshift_verbose){
    std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  }
}

template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
  GridBase *grid=rhs.Grid();
  const int Nsimd = grid->Nsimd();
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;

  int fd = grid->_fdimensions[dimension];
  int rd = grid->_rdimensions[dimension];
  int ld = grid->_ldimensions[dimension];
  int pd = grid->_processors[dimension];
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;

  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
  //          << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
  //          << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;

  assert(comm_dim==1);
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;

  int permute_type=grid->PermuteType(dimension);

  ///////////////////////////////////////////////
  // Simd direction uses an extract/merge pair
  ///////////////////////////////////////////////
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  // int words = sizeof(vobj)/sizeof(vector_type);

  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  {
    size_t bytes = sizeof(scalar_object)*buffer_size;
    grid->ShmBufferFreeAll();
    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
  }
  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }

  int bytes = buffer_size*sizeof(scalar_object);

  ExtractPointerArray<scalar_object>  pointers(Nsimd); //
  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers

  ///////////////////////////////////////////
  // Work out what to send where
  ///////////////////////////////////////////
  int cb    = (cbmask==0x2)? Odd : Even;
  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

  // loop over outer coord planes orthog to dim
  for(int x=0;x<rd;x++){

    // FIXME call local permute copy if none are offnode.
    for(int i=0;i<Nsimd;i++){
      pointers[i] = &send_buf_extract[i][0];
    }
    tgather-=usecond();
    int sx   = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    tgather+=usecond();

    for(int i=0;i<Nsimd;i++){

      int inner_bit = (Nsimd>>(permute_type+1));
      int ic= (i&inner_bit)? 1:0;

      int my_coor  = rd*ic + x;
      int nbr_coor = my_coor+sshift;
      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors

      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
      int nbr_lane = (i&(~inner_bit));

      int recv_from_rank;
      int xmit_to_rank;

      if (nbr_ic) nbr_lane|=inner_bit;

      assert (sx == nbr_ox);

      if(nbr_proc){
	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

	tcomms-=usecond();
	// grid->Barrier();

	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
			     xmit_to_rank,
			     (void *)recv_buf_extract_mpi,
			     recv_from_rank,
			     bytes);
	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
	xbytes+=bytes;

	// grid->Barrier();
	tcomms+=usecond();
	rpointers[i] = &recv_buf_extract[i][0];
      } else {
	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }

    }
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();

  }
  if(Cshift_verbose){
    std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
  }
}
#endif

NAMESPACE_END(Grid);
@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));

  //copy offsets to device
  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);


  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
    exit(EXIT_FAILURE);
  }

  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);

  //sync after copy
  accelerator_barrier();
@ -466,6 +466,12 @@ public:
    static deviceVector<vobj> recv_buf;
    send_buf.resize(buffer_size*2*depth);
    recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
    static hostVector<vobj> hsend_buf;
    static hostVector<vobj> hrecv_buf;
    hsend_buf.resize(buffer_size*2*depth);
    hrecv_buf.resize(buffer_size*2*depth);
#endif

    std::vector<MpiCommsRequest_t> fwd_req;
    std::vector<MpiCommsRequest_t> bwd_req;
@ -495,9 +501,16 @@ public:
      t_gather+=usecond()-t;

      t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(fwd_req,
				(void *)&send_buf[d*buffer_size], xmit_to_rank,
				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
      acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
      grid->SendToRecvFromBegin(fwd_req,
				(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
				(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
      t_comms+=usecond()-t;
    }
    for ( int d=0;d < depth ; d ++ ) {
@ -508,9 +521,16 @@ public:
      t_gather+= usecond() - t;

      t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(bwd_req,
				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
      acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
      grid->SendToRecvFromBegin(bwd_req,
				(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
				(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
      t_comms+=usecond()-t;
    }

@ -533,8 +553,13 @@ public:

    t=usecond();
    grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
    for ( int d=0;d < depth ; d ++ ) {
      acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
    }
#endif
    t_comms+= usecond() - t;

    t=usecond();
    for ( int d=0;d < depth ; d ++ ) {
      ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@ -543,6 +568,11 @@ public:

    t=usecond();
    grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
    for ( int d=0;d < depth ; d ++ ) {
      acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
    }
#endif
    t_comms+= usecond() - t;

    t=usecond();
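
The exchange above posts all forward-face and backward-face transfers before completing any of them, so gathers and messages overlap; with non-aware MPI the host-to-device drain happens only after the matching CommsComplete. In outline (illustrative only; exchange_face and scatter_face stand for the gather + SendToRecvFromBegin and ScatterSlice code above):

  std::vector<MpiCommsRequest_t> fwd_req, bwd_req;
  for(int d=0; d<depth; d++) exchange_face(fwd_req, d);        // post forward faces
  for(int d=0; d<depth; d++) exchange_face(bwd_req, d+depth);  // post backward faces
  grid->CommsComplete(fwd_req);  // then (non-aware MPI) copy hrecv_buf -> recv_buf
  for(int d=0; d<depth; d++) scatter_face(d);
  grid->CommsComplete(bwd_req);  // likewise for the backward faces
  for(int d=0; d<depth; d++) scatter_face(d+depth);
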

Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h (new file, 196 lines)
@ -0,0 +1,196 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h

    Copyright (C) 2020 - 2025

    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Nils Meyer <nils.meyer@ur.de>
    Author: Christoph Lehner <christoph@lhnr.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#pragma once

#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>

NAMESPACE_BEGIN(Grid);

// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description

template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
				     public WilsonCloverHelpers<Impl>,
				     public CompactWilsonCloverHelpers<Impl> {
  /////////////////////////////////////////////
  // Sizes
  /////////////////////////////////////////////

public:

  INHERIT_COMPACT_CLOVER_SIZES(Impl);

  /////////////////////////////////////////////
  // Type definitions
  /////////////////////////////////////////////

public:

  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);

  typedef WilsonFermion5D<Impl>            WilsonBase;
  typedef WilsonCloverHelpers<Impl>        Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;

  /////////////////////////////////////////////
  // Constructors
  /////////////////////////////////////////////

public:

  CompactWilsonCloverFermion5D(GaugeField& _Umu,
			       GridCartesian         &FiveDimGrid,
			       GridRedBlackCartesian &FiveDimRedBlackGrid,
			       GridCartesian         &FourDimGrid,
			       GridRedBlackCartesian &FourDimRedBlackGrid,
			       const RealD _mass,
			       const RealD _csw_r = 0.0,
			       const RealD _csw_t = 0.0,
			       const RealD _cF = 1.0,
			       const ImplParams& impl_p = ImplParams());

  /////////////////////////////////////////////
  // Member functions (implementing interface)
  /////////////////////////////////////////////

public:

  virtual void Instantiatable() {};
  int ConstEE()     override { return 0; };
  int isTrivialEE() override { return 0; };

  void Dhop(const FermionField& in, FermionField& out, int dag) override;

  void DhopOE(const FermionField& in, FermionField& out, int dag) override;

  void DhopEO(const FermionField& in, FermionField& out, int dag) override;

  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;

  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;

  void M(const FermionField& in, FermionField& out) override;

  void Mdag(const FermionField& in, FermionField& out) override;

  void Meooe(const FermionField& in, FermionField& out) override;

  void MeooeDag(const FermionField& in, FermionField& out) override;

  void Mooee(const FermionField& in, FermionField& out) override;

  void MooeeDag(const FermionField& in, FermionField& out) override;

  void MooeeInv(const FermionField& in, FermionField& out) override;

  void MooeeInvDag(const FermionField& in, FermionField& out) override;

  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;

  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;

  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;

  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;

  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;

  /////////////////////////////////////////////
  // Member functions (internals)
  /////////////////////////////////////////////

  void MooeeInternal(const FermionField&        in,
		     FermionField&              out,
		     const CloverDiagonalField& diagonal,
		     const CloverTriangleField& triangle);

  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////

  void ImportGauge(const GaugeField& _Umu) override;

  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////

private:

  template<class Field>
  const MaskField* getCorrectMaskField(const Field &in) const {
    if(in.Grid()->_isCheckerBoarded) {
      if(in.Checkerboard() == Odd) {
	return &this->BoundaryMaskOdd;
      } else {
	return &this->BoundaryMaskEven;
      }
    } else {
      return &this->BoundaryMask;
    }
  }

  template<class Field>
  void ApplyBoundaryMask(Field& f) {
    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
    assert(m != nullptr);
    CompactHelpers::ApplyBoundaryMask(f, *m);
  }

  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////

public:

  RealD csw_r;
  RealD csw_t;
  RealD cF;
  int   n_rhs;

  bool fixedBoundaries;

  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;

  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;

  FermionField Tmp;

  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};

NAMESPACE_END(Grid);
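
For orientation, the new 5d operator is constructed like WilsonFermion5D plus the clover couplings. A hedged usage sketch with Grid's usual SpaceTimeGrid helpers (Umu, Ls, mass, csw_r, csw_t and cF are user inputs; the CompactWilsonCloverFermion5DD typedef is added later in this diff):

  Coordinate latt = GridDefaultLatt();
  Coordinate simd = GridDefaultSimd(Nd, vComplexD::Nsimd());
  Coordinate mpi  = GridDefaultMpi();
  GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi);
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  CompactWilsonCloverFermion5DD Dwc(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
				    mass, csw_r, csw_t, cF);
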
@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>          // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>   // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS

// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover    = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D  = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;

typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;

typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;

typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
@ -484,6 +484,11 @@ public:
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    accelerator_barrier();
#ifdef NVLINK_GET
    this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
				   // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
				   // Or issue barrier AFTER the DMA is running
#endif
  }

};
@ -91,13 +91,13 @@ public:
  virtual void  Mdag    (const FermionField &in, FermionField &out){assert(0);};

  // half checkerboard operations; leave unimplemented as abstract for now
  virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
  virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
  virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
  virtual void   Meooe       (const FermionField &in, FermionField &out);
  virtual void   Mooee       (const FermionField &in, FermionField &out);
  virtual void   MooeeInv    (const FermionField &in, FermionField &out);

  virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
  virtual void   Mdir        (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
  virtual void   MdirAll     (const FermionField &in, std::vector<FermionField> &out){assert(0);};     // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
@ -0,0 +1,376 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h

    Copyright (C) 2017 - 2025

    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Christoph Lehner <christoph@lhnr.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>


NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
										GridCartesian         &FiveDimGrid,
										GridRedBlackCartesian &FiveDimRedBlackGrid,
										GridCartesian         &FourDimGrid,
										GridRedBlackCartesian &FourDimRedBlackGrid,
										const RealD _mass,
										const RealD _csw_r,
										const RealD _csw_t,
										const RealD _cF,
										const ImplParams& impl_p)
  : WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&FourDimGrid),        Triangle(&FourDimGrid)
  , DiagonalEven(&FourDimRedBlackGrid),    TriangleEven(&FourDimRedBlackGrid)
  , DiagonalOdd(&FourDimRedBlackGrid),     TriangleOdd(&FourDimRedBlackGrid)
  , DiagonalInv(&FourDimGrid),     TriangleInv(&FourDimGrid)
  , DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
  , DiagonalInvOdd(&FourDimRedBlackGrid),  TriangleInvOdd(&FourDimRedBlackGrid)
  , Tmp(&FiveDimGrid)
  , BoundaryMask(&FiveDimGrid)
  , BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
{
  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);

  csw_r *= 0.5;
  csw_t *= 0.5;
  //if (clover_anisotropy.isAnisotropic)
  //  csw_r /= clover_anisotropy.xi_0;

  ImportGauge(_Umu);
  if (fixedBoundaries) {
    this->BoundaryMaskEven.Checkerboard() = Even;
    this->BoundaryMaskOdd.Checkerboard() = Odd;
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
  }
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(this->fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
  if(this->fixedBoundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
    } else {
      MooeeInternal(in, out, DiagonalEven, TriangleEven);
    }
  } else {
    MooeeInternal(in, out, Diagonal, Triangle);
  }
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
    } else {
      MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
    }
  } else {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  }
  if(fixedBoundaries) ApplyBoundaryMask(out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!fixedBoundaries); // TODO check for changes required for open bc

  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());

  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());

  Impl::extractLinkField(U, this->Umu);

  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);

  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;

  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};

  /*
    sigma_{\mu \nu}=
      | 0          sigma[0]   sigma[1]   sigma[2]  |
      | sigma[3]   0          sigma[4]   sigma[5]  |
      | sigma[6]   sigma[7]   0          sigma[8]  |
      | sigma[9]   sigma[10]  sigma[11]  0         |
  */

  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
	continue;

      RealD factor;
      if (nu == 4 || mu == 4)
      {
	factor = 2.0 * csw_t;
      }
      else
      {
	factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
      count++;
    }

    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
								      FermionField&              out,
								      const CloverDiagonalField& diagonal,
								      const CloverTriangleField& triangle) {
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  out.Checkerboard() = in.Checkerboard();
  conformable(in, out);
  CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
}

template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation

  // Import gauge into base class
  double t0 = usecond();
  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that

  // Initialize temporary variables
  double t1 = usecond();
  conformable(_Umu.Grid(), this->GaugeGrid());
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
  CloverField TmpInverse(grid);

  // Compute the field strength terms mu>nu
  double t2 = usecond();
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);

  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
  double t3 = usecond();
  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;

  // Instantiate the clover term
  // - In case of the standard clover the mass term is added
  // - In case of the exponential clover the clover term is exponentiated
  double t4 = usecond();
  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);

  // Convert the data layout of the clover term
  double t5 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);

  // Modify the clover term at the temporal boundaries in case of open boundary conditions
  double t6 = usecond();
  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);

  // Invert the Clover term
  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
  // in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
  // TODO: For now this inversion is explicitly done on the CPU
  double t7 = usecond();
  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);

  // Fill the remaining clover fields
  double t8 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);

  // Report timings
  double t9 = usecond();

  std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
}

NAMESPACE_END(Grid);
@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -484,6 +485,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
  Dhop(in,out,dag); // -0.5 is included
  axpy(out,4.0-M5,in,out);
}
template <class Impl>
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
  }
}

template <class Impl>
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
    DhopOE(in, out, DaggerYes);
  }
}

template <class Impl>
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(4.0 + M5);
  out = scal * in;
}

template <class Impl>
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
}

template<class Impl>
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();
  out = (1.0/(4.0 + M5))*in;
}

template<class Impl>
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in,out);
}

template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)
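
Since the 5d Wilson even-even block added above is just the constant (4 + M5), MooeeInv is its exact inverse. A short sanity-check sketch (illustrative; Dw, psi_e and FrbGrid are an assumed operator, even-checkerboard field and red-black grid):

  FermionField tmp(FrbGrid); tmp.Checkerboard() = Even;
  FermionField chk(FrbGrid); chk.Checkerboard() = Even;
  Dw.Mooee(psi_e, tmp);    // tmp = (4 + M5) * psi_e
  Dw.MooeeInv(tmp, chk);   // chk should reproduce psi_e
  chk = chk - psi_e;
  std::cout << "Mooee roundtrip error " << norm2(chk) << std::endl; // expect ~0
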
@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else {							\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
  acceleratorSynchronise();					\
  acceleratorSynchronise();					\
  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
  Recon(result, Uchi);

@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    if (Opt == WilsonKernelsStatic::OptInlineAsm ) {  ASM_CALL(AsmDhopSiteInt); return;}
#endif
  } else if( exterior ) {
    // dependent on result of merge
    // // dependent on result of merge
    acceleratorFenceComputeStream();
    if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
@ -0,0 +1,45 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master

    Copyright (C) 2017 - 2025

    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    Author: Christoph Lehner <christoph@lhnr.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>

NAMESPACE_BEGIN(Grid);

#include "impl.h"
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;

NAMESPACE_END(Grid);
@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master
@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master
@ -62,7 +62,7 @@ do
done
done

CC_LIST="CompactWilsonCloverFermionInstantiation"
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"

for impl in $COMPACT_WILSON_IMPL_LIST
do
@ -76,27 +76,27 @@ public:
    return action;
  };

  virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
    //extend Ta to include Lorentz indexes
    RealD factor_p = c_plaq/RealD(Nc)*0.5;
    RealD factor_r = c_rect/RealD(Nc)*0.5;

    GridBase *grid = Umu.Grid();
    GridBase *grid = U.Grid();

    std::vector<GaugeLinkField> U (Nd,grid);
    std::vector<GaugeLinkField> Umu (Nd,grid);
    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
      Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
    }
    std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
    WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
    WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);

    GaugeLinkField dSdU_mu(grid);
    GaugeLinkField staple(grid);

    for (int mu=0; mu < Nd; mu++){
      dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
      dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;

      dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
      dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;

      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
@ -73,20 +73,23 @@ public:
    // extend Ta to include Lorentz indexes

    RealD factor = 0.5 * beta / RealD(Nc);
    GridBase *grid = U.Grid();

    GaugeLinkField Umu(U.Grid());
    GaugeLinkField dSdU_mu(U.Grid());
    GaugeLinkField dSdU_mu(grid);
    std::vector<GaugeLinkField> Umu(Nd, grid);
    for (int mu = 0; mu < Nd; mu++) {
      Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
    }

      Umu = PeekIndex<LorentzIndex>(U, mu);

    for (int mu = 0; mu < Nd; mu++) {
      // Staple in direction mu
      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
      dSdU_mu = Ta(Umu * dSdU_mu) * factor;

      WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
      dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;

      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
  }

private:
  RealD beta;
};
@ -207,11 +207,14 @@ std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(con
|
||||
}
|
||||
|
||||
template <class Gimpl>
|
||||
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
|
||||
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int meas_interval){
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
|
||||
});
|
||||
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Energy density (cloverleaf) : " << step << " " << t << " " << energyDensityCloverleaf(t,U) << std::endl;
|
||||
});
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
|
||||
});
|
||||
}
|
||||
|
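// Illustrative sketch, not from the Grid source: callers can register extra
// observables with the same lambda signature used above. The wrapper name,
// interval and observable below are hypothetical; avgPlaquette is the
// standard WilsonLoops helper.
template <class Gimpl>
void addPlaquetteMeasurement(WilsonFlowBase<Gimpl> &flow, int interval) {
  flow.addMeasurement(interval, [](int step, RealD t, const typename Gimpl::GaugeField &U) {
    // print flow time and average plaquette at each sampled step
    std::cout << GridLogMessage << "[WilsonFlow] Plaquette : " << step << " " << t
              << " " << WilsonLoops<Gimpl>::avgPlaquette(U) << std::endl;
  });
}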
@ -292,19 +292,21 @@ public:
//////////////////////////////////////////////////
// the sum over all nu-oriented staples for nu != mu on each site
//////////////////////////////////////////////////
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {

GridBase *grid = Umu.Grid();

std::vector<GaugeMat> U(Nd, grid);
std::vector<GaugeMat> Umu(Nd, U.grid());
for (int d = 0; d < Nd; d++) {
U[d] = PeekIndex<LorentzIndex>(Umu, d);
Umu[d] = PeekIndex<LorentzIndex>(U, d);
}
Staple(staple, U, mu);
Staple(staple, Umu, mu);
}

static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
staple = Zero();
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {

autoView(staple_v, staple, AcceleratorWrite);
accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
staple_v[i] = Zero();
});

for (int nu = 0; nu < Nd; nu++) {

@ -318,12 +320,12 @@ public:
// |
// __|
//


staple += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Umu[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
mu);

// __
@ -333,8 +335,8 @@ public:
//

staple += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu], nu,
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
Gimpl::CovShiftBackward(Umu[nu], nu,
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
}
}
}

@ -363,12 +363,16 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so no one is running slow and still using the receive buffer
// But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@ -376,8 +380,15 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
// std::cout << "Communicate CopySynch "<<std::endl;
// _grid->Barrier();
acceleratorCopySynchronise();
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@ -395,7 +406,14 @@ public:

void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Complete "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Start communicate complete");
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
@ -428,6 +446,7 @@ public:
Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
accelerator_barrier();
}

template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@ -483,6 +502,9 @@ public:
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
// accelerator_barrier();
//////////////////////////////////
// I will overwrite my send buffers
//////////////////////////////////
_grid->StencilBarrier();// Synch shared memory on a single node

assert(source.Grid()==_grid);
@ -496,7 +518,11 @@ public:
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier(); // All my local gathers are complete
// _grid->StencilBarrier();// Synch shared memory on a single node
#ifdef NVLINK_GET
_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single node; could use an asynchronous barrier here and defer the check
// Or issue the barrier AFTER the DMA is running
#endif
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
}
@ -535,6 +561,7 @@ public:
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
acceleratorFenceComputeStream();
// Also fenced in WilsonKernels
}
}

@ -663,7 +690,7 @@ public:
}
}
}
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@ -683,6 +710,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@ -774,8 +802,8 @@ public:
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];

std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;

for(int ii=0;ii<npoints;ii++){

@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};

typedef int acceleratorEvent_t;

inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
return 0;
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
//auto discard=cudaStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}

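// Illustrative usage sketch, not from the Grid source: on this CUDA path
// acceleratorEvent_t is a plain int, acceleratorEventWait is effectively a
// no-op and acceleratorEventIsComplete always returns 1, so completion is
// really guaranteed by synchronising copyStream.
//
//   acceleratorEvent_t ev = acceleratorCopyDeviceToDeviceAsynch(src,dst,bytes);
//   /* ... queue or run independent work here ... */
//   acceleratorEventWait(ev);      // trivial here; kept for API symmetry
//   acceleratorCopySynchronise();  // orders all copies issued on copyStream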
inline int acceleratorIsCommunicable(void *ptr)
@ -343,11 +357,28 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};

inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }

inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}

///////
// Asynch event interface
///////
typedef sycl::event acceleratorEvent_t;

inline void acceleratorEventWait(acceleratorEvent_t ev)
{
ev.wait();
}

inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
{
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
}

inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }

inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

inline int acceleratorIsCommunicable(void *ptr)
@ -358,8 +389,10 @@ inline int acceleratorIsCommunicable(void *ptr)
else return 0;
#endif
return 1;

}


#endif

//////////////////////////////////////////////
@ -459,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocHost((void **)&ptr,bytes);
auto err = hipHostMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipHostMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@ -492,23 +525,35 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}

inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
typedef int acceleratorEvent_t;

inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
return 0;
}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };

inline void acceleratorEventWait(acceleratorEvent_t ev)
{
// auto discard=hipStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}


#endif

inline void acceleratorPin(void *ptr,unsigned long bytes)
@ -545,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)

#undef GRID_SIMT

typedef int acceleratorEvent_t;

inline void acceleratorMem(void)
{
/*
@ -564,9 +611,12 @@ inline void acceleratorMem(void)

accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline void acceleratorEventWait(acceleratorEvent_t ev){}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}

inline void acceleratorCopySynchronise(void) {};

inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@ -655,9 +705,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
acceleratorCopySynchronise();
}

template<class T> void acceleratorPut(T& dev,T&host)
template<class T> void acceleratorPut(T& dev,const T&host)
{
acceleratorCopyToDevice(&host,&dev,sizeof(T));
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
}
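// Illustrative usage sketch, not from the Grid source; the struct and names
// are hypothetical. The const-correct acceleratorPut mirrors a host object
// into device memory, and acceleratorGet copies it back by value.
//
//   struct Params { double mass; int Ls; };
//   Params host_params{0.01, 16};
//   Params *dev_params = (Params *)acceleratorAllocDevice(sizeof(Params));
//   acceleratorPut(*dev_params, host_params);   // host -> device
//   Params check = acceleratorGet(*dev_params); // device -> host
//   acceleratorFreeDevice(dev_params);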
template<class T> T acceleratorGet(T& dev)
{

@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_critical DO_PRAGMA(omp critical)

#ifdef GRID_OMP
inline void thread_bcopy(void *from, void *to,size_t bytes)
inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
uint64_t *ufrom = (uint64_t *)from;
const uint64_t *ufrom = (const uint64_t *)from;
uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0);
uint64_t words=bytes/8;
@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
});
}
#else
inline void thread_bcopy(void *from, void *to,size_t bytes)
inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
bcopy(from,to,bytes);
}
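// Note (editorial, restating the code above): the GRID_OMP variant copies
// 64-bit words and asserts bytes%8==0, so callers must pass sizes that are
// multiples of 8; the plain bcopy fallback has no such restriction.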
@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt,
Grid_default_mpi);


if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
FlightRecorder::PrintEntireLog = 1;
FlightRecorder::ChecksumComms = 1;
FlightRecorder::ChecksumCommsSend=1;
}

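// Usage sketch (hypothetical command line, for illustration only): any binary
// that calls Grid_init can now enable the recorder from the shell, e.g.
//   ./Benchmark_dwf_fp32 --grid 16.16.16.32 --mpi 1.1.1.1 --flightrecorder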
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@ -651,3 +658,4 @@ void Grid_debug_handler_init(void)
}

NAMESPACE_END(Grid);


@ -50,7 +50,7 @@ namespace Grid{
int64_t index64;
IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
}
assert(index64<2*1024*1024*1024LL);
index = (int) index64;

@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif

using namespace Grid;

int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);

Grid_init(&argc, &argv);
@ -220,7 +227,6 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing

Grid_finalize();
#endif
} // main



@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */

#include <Grid/Grid.h>

#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif

using namespace Grid;

int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);

Grid_init(&argc, &argv);
@ -220,6 +228,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing

Grid_finalize();
#endif
} // main



@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif

using namespace Grid;

int main(int argc, char **argv)
{
#if Nc != 3
#warning HMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);

Grid_init(&argc, &argv);
@ -220,6 +227,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing

Grid_finalize();
#endif
} // main



@ -52,7 +52,7 @@ int main (int argc, char ** argv)

int threads = GridThread::GetThreads();

int Ls=8;
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;

@ -175,8 +175,8 @@ public:
timestat.statistics(t_time);

dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double bidibytes = dbytes;
double xbytes = dbytes;
double bidibytes = dbytes*2.0;
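// Accounting note (editorial, assuming dbytes counts the payload moved one
// way per exchange after the ppn scaling): the one-way figure is now reported
// as dbytes and the bidirectional figure as 2*dbytes, where the old code
// halved the one-way number instead.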

std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
<< bytes << " \t "
@ -492,17 +492,18 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;

FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t no = 50;
uint64_t ni = 100;

// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
for(uint64_t j=0;j<ni;j++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
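// Timing note (editorial): each of the "no" outer samples now times a burst
// of "ni" back-to-back DhopEO calls, so the statistics below scale the flop
// rate by ni (mflops = flops/timestat.mean*ni) and report the per-call time
// as timestat.mean/ni.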
@ -520,11 +521,11 @@ public:
double mf_hi, mf_lo, mf_err;

timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;

mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -535,6 +536,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;

}

@ -654,17 +656,19 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;

FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t no = 50;
uint64_t ni = 100;

// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo);
for(uint64_t j=0;j<ni;j++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
@ -675,11 +679,11 @@ public:
double mf_hi, mf_lo, mf_err;

timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;

mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -689,6 +693,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;

}

@ -792,19 +797,18 @@ public:
Dc.M(src,r);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;

FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t ni = 100;
uint64_t no = 50;

// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dc.M(src,r);
t1=usecond();
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
double t0=usecond();
for(uint64_t j=0;j<ni;j++){
Dc.M(src,r);
}
double t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
@ -814,20 +818,21 @@ public:
double mf_hi, mf_lo, mf_err;

timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;

mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;

std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;

}

@ -872,7 +877,7 @@ int main (int argc, char ** argv)
int do_dslash=1;

int sel=4;
std::vector<int> L_list({8,12,16,24});
std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1;

std::vector<double> clover;

@ -151,7 +151,7 @@ AC_ARG_ENABLE([tracing],
case ${ac_TRACING} in
nvtx)
AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
LIBS="${LIBS} -lnvToolsExt64_1"
LIBS="${LIBS} -lnvToolsExt"
;;
roctx)
AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])

@ -93,10 +93,13 @@ int main(int argc, char ** argv)
Real coeff = (width*width) / Real(4*Iterations);

chi=kronecker;

// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(chi,psi);
chi = chi - coeff*psi;
RealD n2 = norm2(chi);
chi = chi * (1.0/std::sqrt(n2));
}
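// Math note (editorial, assuming Laplacian.M applies -nabla^2, i.e.
// eigenvalue p^2 on a plane wave): with coeff = width^2/(4*Iterations), each
// step multiplies a momentum mode by (1 - width^2 p^2/(4N)), so after N steps
// chi ~ (1 - width^2 p^2/(4N))^N kronecker -> exp(-width^2 p^2/4) kronecker,
// i.e. Gaussian (Wuppertal) smearing of width ~ "width", up to the per-step
// normalisation applied in the loop.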

std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;

74  systems/Aurora/benchmarks/bench16.pbs  Normal file
@ -0,0 +1,74 @@
#!/bin/bash

##PBS -q LatticeQCD_aesp_CNDA
#PBS -q debug-scaling
##PBS -q prod
#PBS -l select=16
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA

cd $PBS_O_WORKDIR

source ../sourceme.sh

cp $PBS_NODEFILE nodefile

export OMP_NUM_THREADS=4
export MPICH_OFI_NIC_POLICY=GPU

#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16

#
# Local vol 16.16.16.32
#

LX=16
LY=16
LZ=16
LT=32

NX=2
NY=2
NZ=4
NT=1

GX=2
GY=2
GZ=1
GT=3

PX=$((NX * GX ))
PY=$((NY * GY ))
PZ=$((NZ * GZ ))
PT=$((NT * GT ))

VX=$((PX * LX ))
VY=$((PY * LY ))
VZ=$((PZ * LZ ))
VT=$((PT * LT ))

NP=$((PX*PY*PZ*PT))
VOL=${VX}.${VY}.${VZ}.${VT}
AT=8
MPI=${PX}.${PY}.${PZ}.${PT}

CMD="mpiexec -np $NP -ppn 12 -envall \
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi $MPI --grid $VOL \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "

echo VOL $VOL
echo MPI $MPI
echo NPROC $NP
echo $CMD
$CMD
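# Worked numbers for the values above (editorial): PX.PY.PZ.PT = 4.4.4.3, so
# NP = 4*4*4*3 = 192 ranks and VOL = 64.64.64.96 from the 16.16.16.32 local
# volume; 192 ranks at 12 per node (-ppn 12) matches the select=16 node request.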

@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero

export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
@ -30,8 +30,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A

if [ $PALS_RANKID = "0" ]
then
numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
# numactl -p $NUMAP -N $NUMAP "$@"
# numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
numactl -p $NUMAP -N $NUMAP "$@"
else
numactl -p $NUMAP -N $NUMAP "$@"
fi

@ -1,18 +1,19 @@
#Ahead of time compile for PVC

export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"

#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "

../../configure \
../configure \
--enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \

22  systems/Frontier-rocm631/config-command  Normal file
@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"



16  systems/Frontier-rocm631/sourceme631.sh  Normal file
@ -0,0 +1,16 @@

echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh

#module load cce/15.0.1

module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH

#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

@ -30,14 +30,10 @@ source ${root}/sourceme.sh

export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM

for vol in 32.32.32.64
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 64.64.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol

srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
done

@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"



@ -1,12 +1,25 @@

echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich
module load gmp

module load cce/15.0.1
module load rocm/5.3.0
module load cray-fftw
module load craype-accel-amd-gfx90a

#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH

206  systems/WorkArounds.txt  Normal file
@ -0,0 +1,206 @@
The purpose of this file is to collate all non-obvious known magic shell variables
and compiler flags required for either correctness or performance on various systems.

A repository of work-arounds.

Contents:
1. Interconnect + MPI
2. Compilation
3. Profiling

************************
* 1. INTERCONNECT + MPI
************************

--------------------------------------------------------------------
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
--------------------------------------------------------------------
export OMPI_MCA_io=romio321

--------------------------------------
ROMIO fails with > 2GB per node reads (32-bit issue)
--------------------------------------

Use a later MPICH

https://github.com/paboyle/Grid/issues/381

https://github.com/pmodels/mpich/commit/3a479ab0

--------------------------------------------------------------------
Slingshot: Frontier and Perlmutter libfabric slow down
and physical memory fragmentation
--------------------------------------------------------------------
export FI_MR_CACHE_MONITOR=disabled
or
export FI_MR_CACHE_MONITOR=kdreg2

--------------------------------------------------------------------
Perlmutter
--------------------------------------------------------------------

export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0

--------------------------------------------------------------------
Frontier/LumiG
--------------------------------------------------------------------

Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU transfers

cat << EOF > select_gpu
#!/bin/bash
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu

srun ./select_gpu BINARY


--------------------------------------------------------------------
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes

--------------------------------------------------------------------
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export UCX_MEMTYPE_CACHE=n

--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------

https://github.com/pmodels/mpich/issues/7302

--enable-cuda-aware-mpi=no
--enable-unified=no

Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
Do not use SVM

Ideally use MPICH with fix to issue 7302:

https://github.com/pmodels/mpich/pull/7312

Ideally:
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic

Alternatives:
export MPIR_CVAR_NOLOCAL=1
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000

--------------------------------------------------------------------
MPICH/Aurora/PVC GPU pipeline mode
--------------------------------------------------------------------

Broken:
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

This gives good performance without requiring
--enable-cuda-aware-mpi=no

But is an open issue reported by James Osborn
https://github.com/pmodels/mpich/issues/7139

Possibly resolved but unclear if in the installed software yet.

************************
* 2. COMPILATION
************************

--------------------------------------------------------------------
G++ compiler breakage / graveyard
--------------------------------------------------------------------

9.3.0, 10.3.1,
https://github.com/paboyle/Grid/issues/290
https://github.com/paboyle/Grid/issues/264

Working (-) Broken (X):

4.9.0 -
4.9.1 -
5.1.0 X
5.2.0 X
5.3.0 X
5.4.0 X
6.1.0 X
6.2.0 X
6.3.0 -
7.1.0 -
8.0.0 (HEAD) -

https://github.com/paboyle/Grid/issues/100

--------------------------------------------------------------------
AMD GPU nodes :
--------------------------------------------------------------------

multiple ROCM versions broken; use 5.3.0
manifests itself as wrong results in fp32

https://github.com/paboyle/Grid/issues/464

--------------------------------------------------------------------
Aurora/PVC
--------------------------------------------------------------------

SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
SYCL slow link and relocatable code issues (Christoph Lehner)
Opt large register file required for good performance in fp64


export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"

--------------------------------------------------------------------
Aurora/PVC useful extra options
--------------------------------------------------------------------

Host only sanitizer:
-Xarch_host -fsanitize=leak
-Xarch_host -fsanitize=address

Deterministic MPI reduction:
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE



************************
* 3. Visual profile tools
************************

--------------------------------------------------------------------
Frontier/rocprof
--------------------------------------------------------------------

--------------------------------------------------------------------
Aurora/unitrace
--------------------------------------------------------------------


--------------------------------------------------------------------
Tursa/nsight-sys
--------------------------------------------------------------------
32  systems/sdcc-genoa/bench.slurm  Normal file
@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd

source sourceme.sh

export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)

for t in 2
do

export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}

#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
for vol in 48.48.48.96
do
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
done
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
done

36  systems/sdcc-genoa/bench2.slurm  Normal file
@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=2
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd

source sourceme.sh

export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)

nodes=2
mpi=1.1.1.2

for t in 2
do

export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}

#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr

for vol in 64.64.64.128
do
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
done

done

16  systems/sdcc-genoa/config-command  Normal file
@ -0,0 +1,16 @@
../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
--enable-shm=shmopen \
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17"


4  systems/sdcc-genoa/sourceme.sh  Normal file
@ -0,0 +1,4 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi
17  systems/spack-linux/config-command  Normal file
@ -0,0 +1,17 @@
../../src/Grid/configure \
--prefix /home/pab/NPR/install \
--enable-comms=mpi-auto \
--enable-simd=AVX2 \
--enable-shm=none \
--enable-debug \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-gparity \
--disable-fermion-reps \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17 "

28  systems/spack-linux/sourceme.sh  Normal file
@ -0,0 +1,28 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@12
spack load autoconf%clang@12.0.1
spack load automake%clang@12.0.1
spack load c-lime%clang@12.0.1
spack load fftw%clang@12.0.1
spack load gmp%clang@12.0.1
spack load mpfr%clang@12.0.1
spack load openmpi%clang@12.0.1
spack load openssl%clang@12.0.1
spack load hdf5+cxx%clang@12.0.1
spack load cmake%clang@12.0.1
export FFTW=`spack find --paths fftw%clang@12.0.1 | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1 | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
export MPFR=`spack find --paths mpfr%clang@12.0.1 | grep ^mpfr | awk '{print $2}' `
export LLVM=`spack find --paths llvm@12 | grep ^llvm | awk '{print $2}' `
export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk '{print $2}' `
export GMP=`spack find --paths gmp%clang@12.0.1 | grep ^gmp | awk '{print $2}' `
export TCLAP=`spack find --paths tclap%clang@12.0.1 | grep ^tclap | awk '{print $2}' `
export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH

ulimit -s 81920
19  systems/spack-linux/spack-install  Normal file
@ -0,0 +1,19 @@
cd
git clone https://github.com/spack/spack.git
source $HOME/spack/share/spack/setup-env.sh

spack install llvm@12

spack install autoconf%clang@12.0.1
spack install automake%clang@12.0.1
spack install c-lime%clang@12.0.1
spack install fftw%clang@12.0.1
spack install gmp%clang@12.0.1
spack install mpfr%clang@12.0.1
spack install openmpi%clang@12.0.1
spack install openssl%clang@12.0.1
spack install hdf5+cxx%clang@12.0.1
spack install cmake%clang@12.0.1
spack install tclap%clang@12.0.1
spack install emacs%clang@12.0.1

781  tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc  Normal file
@ -0,0 +1,781 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./tests/Test_general_coarse_hdcg.cc

Copyright (C) 2023

Author: Peter Boyle <pboyle@bnl.gov>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
#include <Grid/algorithms/iterative/PowerSpectrum.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>

using namespace std;
using namespace Grid;

template<class aggregation>
void SaveFineEvecs(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Agg[0].Grid()->IsBoss());
WR.open(file);
for(int b=0;b<Agg.size();b++){
WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
}
WR.close();
#endif
}
template<class aggregation>
void SaveBasis(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Agg.FineGrid->IsBoss());
WR.open(file);
for(int b=0;b<Agg.subspace.size();b++){
WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
// WR.writeScidacFieldRecord(Agg.subspace[b],record);
}
WR.close();
#endif
}
template<class aggregation>
void LoadBasis(aggregation &Agg, std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacReader RD ;
RD.open(file);
for(int b=0;b<Agg.subspace.size();b++){
RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
}
RD.close();
#endif
}
|
||||
template<class aggregation>
|
||||
void LoadBasisSkip(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
|
||||
{
|
||||
#ifdef HAVE_LIME
|
||||
emptyUserRecord record;
|
||||
ScidacReader RD ;
|
||||
|
||||
RD.open(file);
|
||||
for(int b=0;b<Agg.subspace.size();b++){
|
||||
for(int n=0;n<N;n++){
|
||||
RD.readScidacFieldRecord(tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||
if(n==0) precisionChange(Agg.subspace[b],tmp);
|
||||
}
|
||||
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
|
||||
}
|
||||
RD.close();
|
||||
#endif
|
||||
}
|
||||
template<class aggregation>
|
||||
void LoadBasisSum(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
|
||||
{
|
||||
#ifdef HAVE_LIME
|
||||
emptyUserRecord record;
|
||||
ScidacReader RD ;
|
||||
|
||||
LatticeFermionF sum(tmp.Grid());
|
||||
RD.open(file);
|
||||
for(int b=0;b<Agg.subspace.size();b++){
|
||||
sum=Zero();
|
||||
for(int n=0;n<N;n++){
|
||||
RD.readScidacFieldRecord(tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||
sum=sum+tmp;
|
||||
}
|
||||
precisionChange(Agg.subspace[b],sum);
|
||||
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
|
||||
}
|
||||
RD.close();
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class CoarseVector>
|
||||
void SaveEigenvectors(std::vector<RealD> &eval,
|
||||
std::vector<CoarseVector> &evec,
|
||||
std::string evec_file,
|
||||
std::string eval_file)
|
||||
{
|
||||
#ifdef HAVE_LIME
|
||||
emptyUserRecord record;
|
||||
ScidacWriter WR(evec[0].Grid()->IsBoss());
|
||||
WR.open(evec_file);
|
||||
for(int b=0;b<evec.size();b++){
|
||||
WR.writeScidacFieldRecord(evec[b],record,0,0);
|
||||
}
|
||||
WR.close();
|
||||
XmlWriter WRx(eval_file);
|
||||
write(WRx,"evals",eval);
|
||||
#endif
|
||||
}
|
||||
template<class CoarseVector>
|
||||
void LoadEigenvectors(std::vector<RealD> &eval,
|
||||
std::vector<CoarseVector> &evec,
|
||||
std::string evec_file,
|
||||
std::string eval_file)
|
||||
{
|
||||
#ifdef HAVE_LIME
|
||||
XmlReader RDx(eval_file);
|
||||
read(RDx,"evals",eval);
|
||||
emptyUserRecord record;
|
||||
|
||||
Grid::ScidacReader RD ;
|
||||
RD.open(evec_file);
|
||||
assert(evec.size()==eval.size());
|
||||
for(int k=0;k<eval.size();k++) {
|
||||
RD.readScidacFieldRecord(evec[k],record);
|
||||
}
|
||||
RD.close();
|
||||
#endif
|
||||
}

// Want Op in CoarsenOp to call MatPcDagMatPc
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
  LinearOperatorBase<Field> & wrapped;
public:
  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
  void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out); }
  void HermOp (const Field &in, Field &out)   { wrapped.HermOp(in,out); }
  void AdjOp  (const Field &in, Field &out)   { wrapped.HermOp(in,out); }
  void OpDiag (const Field &in, Field &out)                  { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out)   { assert(0); };
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
};

template<class Field> class FixedCGPolynomial : public LinearFunction<Field>
{
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator & _SmootherOperator;
  ConjugateGradientPolynomial<Field> CG;
  int iters;
  bool record;
  int replay_count;
  FixedCGPolynomial(int _iters, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    iters(_iters),
    record(true),
    CG(0.0,_iters,false)
  {
    std::cout << GridLogMessage<<" FixedCGPolynomial order "<<iters<<std::endl;
    replay_count = 0;
  };
  void operator() (const Field &in, Field &out)
  {
#if 1
    GridBase *grid = in.Grid();
    Field Mx0(grid);
    Field r0(grid);
    Field Minvr0(grid);

    _SmootherOperator.HermOp(out,Mx0);

    r0 = in - Mx0;

    Minvr0 = Zero();
    Minvr0.Checkerboard()=in.Checkerboard();

    if ( record ) {
      std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
      CG.Solve(_SmootherOperator,r0,Minvr0);
      record = false;
      /*
      std::cout << "P(x) = 0 "<<std::endl;
      for(int i=0;i<CG.polynomial.size();i++){
        std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
      }
      */
      Field tmp(Minvr0.Grid());
      CG.CGsequenceHermOp(_SmootherOperator,r0,tmp);
      tmp = tmp - Minvr0;
      std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
    } else {
      std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
      CG.CGsequenceHermOp(_SmootherOperator,r0,Minvr0);
      if ( replay_count %5== 0 ) record=true;
      replay_count++;
    }
    out = out + Minvr0;
    _SmootherOperator.HermOp(out,r0);
    r0 = r0 - in;
    RealD rr=norm2(r0);
    RealD ss=norm2(in);
    std::cout << " FixedCGPolynomial replayed polynomial resid "<<::sqrt(rr/ss)<<std::endl;
#else
    out = Zero();
    out.Checkerboard()=in.Checkerboard();
    if ( record ) {
      std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
      CG.Solve(_SmootherOperator,in,out);
      record = false;
      std::cout << "P(x) = 0 "<<std::endl;
      for(int i=0;i<CG.polynomial.size();i++){
        std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
      }
      Field tmp(in.Grid());
      CG.CGsequenceHermOp(_SmootherOperator,in,tmp);
      tmp = tmp - out;
      std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
    } else {
      std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
      CG.CGsequenceHermOp(_SmootherOperator,in,out);
      if ( replay_count %5== 5 ) record=true;
      replay_count++;
    }
#endif

  }
  void operator() (const std::vector<Field> &in, std::vector<Field> &out)
  {
    for(int i=0;i<out.size();i++){
      out[i]=Zero();
    }
    int blockDim = 0;//not used for BlockCGVec
    BlockConjugateGradient<Field>    BCGV  (BlockCGrQVec,blockDim,0.0,iters,false);
    BCGV(_SmootherOperator,in,out);
  }

};
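// FixedCGPolynomial makes CG usable as a *fixed* (hence linear) preconditioner:
// the first application runs ConjugateGradientPolynomial and records the CG
// polynomial, subsequent applications replay that fixed polynomial via
// CGsequenceHermOp, and it periodically re-records. The vector overload
// instead dispatches to a BlockCGrQVec solve over all right hand sides.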
template<class Field> class CGSmoother : public LinearFunction<Field>
{
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator & _SmootherOperator;
  int iters;
  CGSmoother(int _iters, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    iters(_iters)
  {
    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
  };
  void operator() (const Field &in, Field &out)
  {
    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother

    out=Zero();

    CG(_SmootherOperator,in,out);
  }
};


RealD InverseApproximation(RealD x){
  return 1.0/x;
}
template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator & _SmootherOperator;
  Chebyshev<Field> Cheby;
  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    Cheby(_lo,_hi,_ord,InverseApproximation)
  {
    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
  };
  void operator() (const Field &in, Field &out)
  {
    //    Field r(out.Grid());
    Cheby(_SmootherOperator,in,out);
    //    _SmootherOperator.HermOp(out,r);
    //    r=r-in;
    //    RealD rr=norm2(r);
    //    RealD ss=norm2(in);
    //    std::cout << GridLogMessage<<" Chebyshev smoother resid "<<::sqrt(rr/ss)<<std::endl;
  }
};


template<class Field> class ChebyshevInverter : public LinearFunction<Field>
{
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator & _Operator;
  Chebyshev<Field> Cheby;
  ChebyshevInverter(RealD _lo,RealD _hi,int _ord, FineOperator &Operator) :
    _Operator(Operator),
    Cheby(_lo,_hi,_ord,InverseApproximation)
  {
    std::cout << GridLogMessage<<" Chebyshev Inverter order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
  };
  void operator() (const Field &in, Field &out)
  {
    Field r(in.Grid());
    Field AinvR(in.Grid());
    _Operator.HermOp(out,r);
    r = in - r;               // b - A x
    Cheby(_Operator,r,AinvR); // A^{-1} ( b - A x ) ~ A^{-1} b - x
    out = out + AinvR;
    _Operator.HermOp(out,r);
    r = in - r;               // b - A x
    RealD rr = norm2(r);
    RealD ss = norm2(in);
    std::cout << "ChebshevInverse resid " <<::sqrt(rr/ss)<<std::endl;
  }
};
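// ChebyshevInverter performs one iterative-refinement step,
//   x <- x + p(A) (b - A x),
// with p the Chebyshev approximation to 1/x on [lo,hi] built from
// InverseApproximation above, then reports the true relative residual.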


int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  int sample=1;
  if( GridCmdOptionExists(argv,argv+argc,"--sample") ){
    std::string arg;
    arg = GridCmdOptionPayload(argv,argv+argc,"--sample");
    GridCmdOptionInt(arg,sample);
  }

  const int Ls=24;
  const int nbasis = 62;
  const int cb = 0 ;
  RealD mass=0.00078;

  if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
    std::string arg;
    arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
    GridCmdOptionFloat(arg,mass);
  }

  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;

  std::cout << GridLogMessage << " *************************** " <<std::endl;
  std::cout << GridLogMessage << " Mass " <<mass<<std::endl;
  std::cout << GridLogMessage << " M5 " <<M5<<std::endl;
  std::cout << GridLogMessage << " Ls " <<Ls<<std::endl;
  std::cout << GridLogMessage << " b " <<b<<std::endl;
  std::cout << GridLogMessage << " c " <<c<<std::endl;
  std::cout << GridLogMessage << " *************************** " <<std::endl;

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                                   GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                                   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  //////////////////////////////////////////
  // Single precision grids -- lanczos + smoother
  //////////////////////////////////////////
  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                                    GridDefaultSimd(Nd,vComplexF::Nsimd()),
                                                                    GridDefaultMpi());
  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);

  ///////////////////////// Configuration /////////////////////////////////
  LatticeGaugeField Umu(UGrid);

  FieldMetaData header;
  std::string file("ckpoint_lat.1000");
  NerscIO::readConfiguration(Umu,header,file);

  //////////////////////// Fermion action //////////////////////////////////
  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Fine Power method "<<std::endl;
  std::cout << "**************************************"<<std::endl;

  {
    LatticeFermionD pm_src(FrbGrid);
    pm_src = ComplexD(1.0);
    PowerMethod<LatticeFermionD>       fPM;
    fPM(HermOpEO,pm_src);
  }

  if(0)
  {

    std::cout << "**************************************"<<std::endl;
    std::cout << " Fine Lanczos "<<std::endl;
    std::cout << "**************************************"<<std::endl;

    typedef LatticeFermionF FermionField;
    LatticeGaugeFieldF UmuF(UGridF);
    precisionChange(UmuF,Umu);
    MobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,b,c);
    SchurDiagMooeeOperator<MobiusFermionF, LatticeFermionF> HermOpEOF(DdwfF);

    const int Fine_Nstop = 200;
    const int Fine_Nk = 200;
    const int Fine_Np = 200;
    const int Fine_Nm = Fine_Nk+Fine_Np;
    const int Fine_MaxIt= 10;

    RealD Fine_resid = 1.0e-4;
    std::cout << GridLogMessage << "Fine Lanczos "<<std::endl;
    std::cout << GridLogMessage << "Nstop "<<Fine_Nstop<<std::endl;
    std::cout << GridLogMessage << "Nk "<<Fine_Nk<<std::endl;
    std::cout << GridLogMessage << "Np "<<Fine_Np<<std::endl;
    std::cout << GridLogMessage << "resid "<<Fine_resid<<std::endl;

    Chebyshev<FermionField> Cheby(0.002,92.0,401);
    //    Chebyshev<FermionField> Cheby(0.1,92.0,401);
    FunctionHermOp<FermionField> OpCheby(Cheby,HermOpEOF);
    PlainHermOp<FermionField> Op     (HermOpEOF);
    ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Fine_Nstop,Fine_Nk,Fine_Nm,Fine_resid,Fine_MaxIt);
    std::vector<RealD>          Fine_eval(Fine_Nm);
    FermionField                Fine_src(FrbGridF);
    Fine_src = ComplexF(1.0);
    std::vector<FermionField>   Fine_evec(Fine_Nm,FrbGridF);

    int Fine_Nconv;
    std::cout << GridLogMessage <<" Calling IRL.calc single prec"<<std::endl;
    IRL.calc(Fine_eval,Fine_evec,Fine_src,Fine_Nconv);

    std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
    SaveFineEvecs(Fine_evec,evec_file);
  }
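  // The if(0) block above, when enabled, regenerates the single-precision fine
  // eigenvector file Subspace.phys48.evecF that the companion test
  // Test_general_coarse_hdcg_phys48_lanczos_subspace.cc reads back in.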

  //////////////////////////////////////////
  // Construct a coarsened grid with 4^4 cell
  //////////////////////////////////////////
  Coordinate Block({4,4,6,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/Block[d];
  }

  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
                                                            GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                            GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);

  ///////////////////////// RNGs /////////////////////////////////
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});

  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);


  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);

  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;

  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);

  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FrbGrid,cb);

  ////////////////////////////////////////////////////////////
  // Need to check about red-black grid coarsening
  ////////////////////////////////////////////////////////////
  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
  //  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
  //  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
  bool load_agg=true;
  bool load_refine=true;
  bool load_mat=false;
  bool load_evec=false;

  int refine=1;
  if ( load_agg ) {
    if ( !(refine) || (!load_refine) ) {
      LoadBasis(Aggregates,subspace_file);
    }
  } else {
    //    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
    //                                        0.0003,1.0e-5,2000); // Lo, tol, maxit
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run
    Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);
    SaveBasis(Aggregates,subspace_file);
  }

  std::cout << "**************************************"<<std::endl;
  std::cout << "Building MultiRHS Coarse operator"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true);

  const int nrhs=24;

  Coordinate mpi=GridDefaultMpi();
  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});

  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Coarse Lanczos "<<std::endl;
  std::cout << "**************************************"<<std::endl;

  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
  Chebyshev<CoarseVector>      IRLCheby(0.005,42.0,301);  // 1 iter
  MrhsHermMatrix MrhsCoarseOp     (mrhs);

  //  CoarseVector pm_src(CoarseMrhs);
  //  pm_src = ComplexD(1.0);
  //  PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src);

  int Nk=192;
  int Nm=384;
  int Nstop=Nk;
  int Nconv_test_interval=1;

  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
                                                          Coarse5d,
                                                          CoarseMrhs,
                                                          nrhs,
                                                          IRLCheby,
                                                          Nstop,
                                                          Nconv_test_interval,
                                                          nrhs,
                                                          Nk,
                                                          Nm,
                                                          1e-5,10);

  int Nconv;
  std::vector<RealD>            eval(Nm);
  std::vector<CoarseVector>     evec(Nm,Coarse5d);
  std::vector<CoarseVector>     c_src(nrhs,Coarse5d);

  ///////////////////////
  // Deflation guesser object
  ///////////////////////
  MultiRHSDeflation<CoarseVector> MrhsGuesser;

  //////////////////////////////////////////
  // Block projector for coarse/fine
  //////////////////////////////////////////
  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;

  //////////////////////////
  // Extra HDCG parameters
  //////////////////////////
  int maxit=300;
  ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false);
  ConjugateGradient<CoarseVector>  CGstart(5.0e-2,maxit,false);
  RealD lo=2.0;
  int ord = 7;
  //  int ord = 11;

  int blockDim = 0;//not used for BlockCG
  BlockConjugateGradient<CoarseVector>    BCG  (BlockCGrQ,blockDim,5.0e-5,maxit,true);

  DoNothingGuesser<CoarseVector> DoNothing;
  //  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
  //  HPDSolver<CoarseVector> HPDSolveMrhsStart(MrhsCoarseOp,CGstart,DoNothing);
  //  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,BCG,DoNothing);
  //  HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,BCG,DoNothing);
  //  FixedCGPolynomial<CoarseVector> HPDSolveMrhs(maxit,MrhsCoarseOp);

  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp); //
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,110,MrhsCoarseOp); // 114 iter with Chebysmooth and BlockCG
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp); // 138 iter with Chebysmooth
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,200,MrhsCoarseOp); // 139 iter
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-3,40.0,200,MrhsCoarseOp); // 137 iter, CG smooth, flex
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-3,40.0,200,MrhsCoarseOp); // 146 iter, CG smooth, flex
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-4,40.0,200,MrhsCoarseOp); // 156 iter, CG smooth, flex

  /////////////////////////////////////////////////
  // Mirs smoother
  /////////////////////////////////////////////////
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,lo);
  //  FixedCGPolynomial<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
  //  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
  ChebyshevSmoother<LatticeFermionD> CGsmooth(2.0,92.0,8,HermOpEO) ;

  if ( load_refine ) {
    LoadBasis(Aggregates,refine_file);
    //    LatticeFermionF conv_tmp(FrbGridF);
    //    LoadBasisSum(Aggregates,refine_file,sample,conv_tmp);
  } else {
    Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters
    SaveBasis(Aggregates,refine_file);
  }
  Aggregates.Orthogonalise();

  std::cout << "**************************************"<<std::endl;
  std::cout << "Coarsen after refine"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Recompute coarse evecs "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  evec.resize(Nm,Coarse5d);
  eval.resize(Nm);
  for(int r=0;r<nrhs;r++){
    random(CRNG,c_src[r]);
  }
  IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Reimport coarse evecs "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsGuesser.ImportEigenBasis(evec,eval);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Setting up mRHS HDCG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
  MrhsProjector.ImportBasis(Aggregates.subspace);

  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling mRHS HDCG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
    HDCGmrhs(1.0e-8, 300,
             FineHermOp,
             CGsmooth,
             HPDSolveMrhs,    // Used in M1
             HPDSolveMrhs,    // Used in Vstart
             MrhsProjector,
             MrhsGuesser,
             CoarseMrhs);

  std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
  std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
  LatticeFermionD result_accurate(FrbGrid);
  LatticeFermionD result_sloppy(FrbGrid);
  LatticeFermionD error(FrbGrid);
  LatticeFermionD residual(FrbGrid);

  for(int r=0;r<nrhs;r++){
    random(RNG5,src_mrhs[r]);
    res_mrhs[r]=Zero();
  }
  HDCGmrhs(src_mrhs,res_mrhs);
  result_accurate = res_mrhs[0];
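  // TwoLevelADEF2mrhs is the multi right-hand-side HDCG solver: CGsmooth is
  // the fine-grid smoother, HPDSolveMrhs supplies the coarse-grid solve used
  // in both the M1 preconditioner and the deflated Vstart, and MrhsProjector /
  // MrhsGuesser provide the coarse/fine transfer and the eigenvector guess.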

#if 0

  std::vector<RealD> bins({1.0e-3,1.0e-2,1.0e-1,1.0,10.0,100.0});
  std::vector<int> orders({6000   ,4000  ,1000  ,500,500 ,500});
  PowerSpectrum GraphicEqualizer(bins,orders);

  std::cout << "**************************************"<<std::endl;
  std::cout << GridLogMessage << " PowerSpectrum of rrr "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  GraphicEqualizer(FineHermOp,HDCGmrhs.rrr);
  std::cout << "**************************************"<<std::endl;
  std::cout << GridLogMessage << " PowerSpectrum of sss "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  GraphicEqualizer(FineHermOp,HDCGmrhs.sss);
  std::cout << "**************************************"<<std::endl;
  std::cout << GridLogMessage << " PowerSpectrum of qqq "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  GraphicEqualizer(FineHermOp,HDCGmrhs.qqq);
  std::cout << "**************************************"<<std::endl;
  std::cout << GridLogMessage << " PowerSpectrum of zzz "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  GraphicEqualizer(FineHermOp,HDCGmrhs.zzz);

  std::vector<RealD> tols({1.0e-3,1.0e-4,1.0e-5});


  for(auto tol : tols) {

    TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
      HDCGmrhsSloppy(tol, 500,
                     FineHermOp,
                     CGsmooth,
                     HPDSolveMrhs,    // Used in M1
                     HPDSolveMrhs,    // Used in Vstart
                     MrhsProjector,
                     MrhsGuesser,
                     CoarseMrhs);

    // Solve again to 10^-5
    for(int r=0;r<nrhs;r++){
      res_mrhs[r]=Zero();
    }
    HDCGmrhsSloppy(src_mrhs,res_mrhs);

    result_sloppy = res_mrhs[0];
    error = result_sloppy - result_accurate;
    FineHermOp.HermOp(result_sloppy,residual);
    residual = residual - src_mrhs[0];

    std::cout << "**************************************"<<std::endl;
    std::cout << GridLogMessage << " Converged to tolerance "<< tol<<std::endl;
    std::cout << GridLogMessage << " Absolute error "<<norm2(error)<<std::endl;
    std::cout << GridLogMessage << " Residual "<<norm2(residual)<<std::endl;
    std::cout << "**************************************"<<std::endl;

    std::cout << "**************************************"<<std::endl;
    std::cout << GridLogMessage << " PowerSpectrum of error "<<std::endl;
    std::cout << "**************************************"<<std::endl;
    GraphicEqualizer(FineHermOp,error);
    std::cout << "**************************************"<<std::endl;
    std::cout << GridLogMessage << " PowerSpectrum of residual "<<std::endl;
    std::cout << "**************************************"<<std::endl;
    GraphicEqualizer(FineHermOp,residual);

  };
#endif

  // Standard CG
#if 0
  {
    std::cout << "**************************************"<<std::endl;
    std::cout << "Calling red black CG"<<std::endl;
    std::cout << "**************************************"<<std::endl;

    LatticeFermion result(FrbGrid); result=Zero();
    LatticeFermion    src(FrbGrid); random(RNG5,src);
    result=Zero();

    ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
    CGfine(HermOpEO, src, result);
  }
#endif
  Grid_finalize();
  return 0;
}
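A plausible invocation of the test above (a sketch: --grid and --mpi are the standard Grid geometry options, and 48.48.48.96 is an assumption matching the "phys48" naming) is mpirun ./Test_general_coarse_hdcg_phys48_blockcg --grid 48.48.48.96 --mpi 1.2.2.4 --mass 0.00078 --sample 1, with ckpoint_lat.1000 and the Subspace/Refine files present at the hard-coded paths.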
tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc (new file, 355 lines)
@@ -0,0 +1,355 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_general_coarse_hdcg.cc

    Copyright (C) 2023

Author: Peter Boyle <pboyle@bnl.gov>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>

using namespace std;
using namespace Grid;

template<class aggregation>
void SaveFineEvecs(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(Agg[0].Grid()->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.size();b++){
    WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
  }
  WR.close();
#endif
}
template<class aggregation>
void SaveBasis(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(Agg.FineGrid->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    //    WR.writeScidacFieldRecord(Agg.subspace[b],record);
  }
  WR.close();
#endif
}
template<class aggregation>
void LoadBasis(aggregation &Agg, std::string file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    //    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
  }
  RD.close();
#endif
}
template<class aggregation>
void LoadFineEvecs(aggregation &Agg, std::string file,LatticeFermionF & conv_tmp)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.size();b++){
    RD.readScidacFieldRecord(conv_tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    precisionChange(Agg[b],conv_tmp);
  }
  RD.close();
#endif
}
template<class CoarseVector>
void SaveEigenvectors(std::vector<RealD>        &eval,
                      std::vector<CoarseVector> &evec,
                      std::string evec_file,
                      std::string eval_file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(evec[0].Grid()->IsBoss());
  WR.open(evec_file);
  for(int b=0;b<evec.size();b++){
    WR.writeScidacFieldRecord(evec[b],record,0,0);
  }
  WR.close();
  XmlWriter WRx(eval_file);
  write(WRx,"evals",eval);
#endif
}
template<class CoarseVector>
void LoadEigenvectors(std::vector<RealD>        &eval,
                      std::vector<CoarseVector> &evec,
                      std::string evec_file,
                      std::string eval_file)
{
#ifdef HAVE_LIME
  XmlReader RDx(eval_file);
  read(RDx,"evals",eval);
  emptyUserRecord record;

  Grid::ScidacReader RD ;
  RD.open(evec_file);
  assert(evec.size()==eval.size());
  for(int k=0;k<eval.size();k++) {
    RD.readScidacFieldRecord(evec[k],record);
  }
  RD.close();
#endif
}

// Want Op in CoarsenOp to call MatPcDagMatPc
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
  LinearOperatorBase<Field> & wrapped;
public:
  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
  void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out); }
  void HermOp (const Field &in, Field &out)   { wrapped.HermOp(in,out); }
  void AdjOp  (const Field &in, Field &out)   { wrapped.HermOp(in,out); }
  void OpDiag (const Field &in, Field &out)                  { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out)   { assert(0); };
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
};

template<class Field> class CGSmoother : public LinearFunction<Field>
{
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator & _SmootherOperator;
  int iters;
  CGSmoother(int _iters, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    iters(_iters)
  {
    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
  };
  void operator() (const Field &in, Field &out)
  {
    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother

    out=Zero();

    CG(_SmootherOperator,in,out);
  }
};


int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=24;
  const int nbasis = 62;
  const int cb = 0 ;
  RealD mass=0.00078;
  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                                   GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                                   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  // Construct a coarsened grid with 4^4 cell
  Coordinate Block({4,4,6,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/Block[d];
  }

  //////////////////////////////////////////
  // Double precision grids
  //////////////////////////////////////////
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
                                                            GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                            GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);

  //////////////////////////////////////////
  // Single precision grids -- lanczos + smoother
  //////////////////////////////////////////
  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                                    GridDefaultSimd(Nd,vComplexF::Nsimd()),
                                                                    GridDefaultMpi());
  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  ///////////////////////// RNGs /////////////////////////////////
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});

  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);

  ///////////////////////// Configuration /////////////////////////////////
  LatticeGaugeField Umu(UGrid);

  FieldMetaData header;
  std::string file("ckpoint_lat.1000");
  NerscIO::readConfiguration(Umu,header,file);

  //////////////////////// Fermion action //////////////////////////////////
  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);

  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);

  const int Fine_Nstop = 200;
  const int Fine_Nk = 100;
  const int Fine_Np = 100;
  const int Fine_Nm = Fine_Nk+Fine_Np;

  typedef LatticeFermion FermionField;
  std::vector<RealD>          Fine_eval;
  std::vector<FermionField>   Fine_evec;

  LatticeFermionF conv_tmp(FrbGridF);
  Fine_eval.resize(Fine_Nstop);
  Fine_evec.resize(Fine_Nstop,FrbGrid);
  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
  LoadFineEvecs(Fine_evec,evec_file,conv_tmp);

  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);

  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;

  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);

  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FrbGrid,cb);

  ////////////////////////////////////////////////////////////
  // Need to check about red-black grid coarsening
  ////////////////////////////////////////////////////////////
  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
  //  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
  //  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evec");
  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
  //  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
  //  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
  //  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
  bool load_agg=true;
  bool load_refine=true;

  //////////////////////////////////////////
  // Block projector for coarse/fine
  //////////////////////////////////////////
  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;


  /////////////////////////////////////////////////
  // Mirs smoother
  /////////////////////////////////////////////////
  int ord=8;
  RealD lo=2.0;
  RealD MirsShift = lo;
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;

  LoadBasis(Aggregates,refine_file);
  Aggregates.Orthogonalise();

  std::cout << "**************************************"<<std::endl;
  std::cout << " Using filtered subspace"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
  MrhsProjector.ImportBasis(Aggregates.subspace);

  FermionField Ftmp(FrbGrid);
  std::vector<FermionField> Fine_ev(1,FrbGrid);
  std::vector<FermionField> Fine_ev_compressed(1,FrbGrid);
  std::vector<CoarseVector> c_evec(1,Coarse5d);
  for(int ev=0;ev<Fine_evec.size();ev++){
    Fine_ev[0] = Fine_evec[ev];
    MrhsProjector.blockProject(Fine_ev,c_evec);
    MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
    Ftmp = Fine_ev_compressed[0];
    RealD div = 1.0/ sqrt(norm2(Ftmp));
    Ftmp = Ftmp * div;
    std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0]) <<std::endl;
    std::cout << GridLogMessage<<" "<<ev<<" comp   "<< norm2(Ftmp) <<std::endl;
    Ftmp = Fine_ev[0] - Ftmp;
    std::cout << GridLogMessage<<" "<<ev<<" diff   "<< norm2(Ftmp) <<std::endl;
    CGsmooth(Fine_ev_compressed[0],Ftmp);
    Ftmp = Ftmp *lo;
    std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp) <<std::endl;
    div = 1.0/ sqrt(norm2(Ftmp));
    Ftmp=Ftmp*div;
    Ftmp = Fine_ev[0]-Ftmp;
    std::cout << GridLogMessage<<" "<<ev<<" diff   "<< norm2(Ftmp) <<std::endl;
  }
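  // For each fine eigenvector v the loop above measures the blocking
  // compression error || v - P v || (blockProject to the coarse basis,
  // blockPromote back), then applies the shifted-CG smoother to the
  // compressed vector to see how much of the lost component is restored.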

  std::cout << "**************************************"<<std::endl;
  std::cout << " Using eigenvector subspace "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  for(int i=0;i<Aggregates.subspace.size();i++){
    Aggregates.subspace[i] = Fine_evec[i];
  }
  Aggregates.Orthogonalise();
  MrhsProjector.ImportBasis(Aggregates.subspace);
  for(int ev=0;ev<Fine_evec.size();ev++){
    Fine_ev[0] = Fine_evec[ev];
    MrhsProjector.blockProject(Fine_ev,c_evec);
    MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
    Ftmp = Fine_ev_compressed[0];
    RealD div = 1.0/ sqrt(norm2(Ftmp));
    Ftmp = Ftmp * div;
    std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0]) <<std::endl;
    std::cout << GridLogMessage<<" "<<ev<<" comp   "<< norm2(Ftmp) <<std::endl;
    Ftmp = Fine_ev[0] - Ftmp;
    std::cout << GridLogMessage<<" "<<ev<<" diff   "<< norm2(Ftmp) <<std::endl;
    CGsmooth(Fine_ev_compressed[0],Ftmp);
    Ftmp = Ftmp *lo;
    std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp) <<std::endl;
    div = 1.0/ sqrt(norm2(Ftmp));
    Ftmp=Ftmp*div;
    Ftmp = Fine_ev[0]-Ftmp;
    std::cout << GridLogMessage<<" "<<ev<<" diff   "<< norm2(Ftmp) <<std::endl;
  }

  // Standard CG
  Grid_finalize();
  return 0;
}
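The two passes above compare subspace quality: with P the project-then-promote operator onto the span of the current basis, the per-eigenvector figure of merit is ||v - P v||, evaluated first for the refined (filtered) aggregation basis and then for a basis built from the loaded fine eigenvectors themselves.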
@@ -36,28 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;

template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
  LinearOperatorBase<Field> & wrapped;
public:
  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  void Op     (const Field &in, Field &out){
    wrapped.HermOp(in,out);
  }
  void AdjOp  (const Field &in, Field &out){
    wrapped.HermOp(in,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  void HermOp(const Field &in, Field &out){
    wrapped.HermOp(in,out);
  }

};

template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
@@ -69,78 +47,169 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  void Op     (const Field &in, Field &out){
    // std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
  }
  void AdjOp  (const Field &in, Field &out){
    // std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(tmp,out);
    _Mat.Mdag(in,tmp);
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp"<<std::endl;
    // std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    //    _Mat.M(in,tmp);
    //    _PV.Mdag(tmp,out);
    //    _PV.M(out,tmp);
    //    _Mat.Mdag(tmp,out);
    Op(in,tmp);
    AdjOp(tmp,out);
    // std::cout << "HermOp done "<<norm2(out)<<std::endl;
  }
};
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  RealD shift;
public:
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): shift(_shift),_Mat(Mat),_PV(PV){};

  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  void Op     (const Field &in, Field &out){
    // std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    _PV.M(out,tmp);
    _Mat.Mdag(tmp,out);
    std::cout << "HermOp done "<<norm2(out)<<std::endl;

    out = out + shift * in;
  }
};
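// PVdagMLinearOperator applies the Pauli-Villars preconditioned operator:
// Op is PVdag M, AdjOp its adjoint, and HermOp now composes Op followed by
// AdjOp, i.e. (PVdag M)^dag (PVdag M). The new ShiftedPVdagMLinearOperator
// adds a constant shift and is the operator the fine-grid GCR smoother
// targets below.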

template<class Field> class DumbOperator : public LinearOperatorBase<Field> {
public:
  LatticeComplex scale;
  DumbOperator(GridBase *grid) : scale(grid)
  {
    scale = 0.0;
    LatticeComplex scalesft(grid);
    LatticeComplex scaletmp(grid);
    for(int d=0;d<4;d++){
      Lattice<iScalar<vInteger> > x(grid); LatticeCoordinate(x,d+1);
      LatticeCoordinate(scaletmp,d+1);
      scalesft = Cshift(scaletmp,d+1,1);
      scale = 100.0*scale + where( mod(x ,2)==(Integer)0, scalesft,scaletmp);
    }
    std::cout << " scale\n" << scale << std::endl;
  }
  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {};
  void OpDir  (const Field &in, Field &out,int dir,int disp){};
  void OpDirAll (const Field &in, std::vector<Field> &out) {};

  void Op (const Field &in, Field &out){
    out = scale * in;
  }
  void AdjOp (const Field &in, Field &out){
    out = scale * in;
  void AdjOp  (const Field &in, Field &out){
    // std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(tmp,out);
    _Mat.Mdag(in,tmp);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  void HermOp(const Field &in, Field &out){
    double n1, n2;
    HermOpAndNorm(in,out,n1,n2);
  }
  void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
    ComplexD dot;

    out = scale * in;

    dot= innerProduct(in,out);
    n1=real(dot);

    dot = innerProduct(out,out);
    n2=real(dot);
    // std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
};
template<class Fobj,class CComplex,int nbasis>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
  using LinearFunction<Lattice<Fobj> >::operator();

  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
  typedef LinearOperatorBase<FineField>    FineOperator;
  typedef LinearFunction    <FineField>    FineSmoother;
  typedef LinearOperatorBase<CoarseVector> CoarseOperator;
  typedef LinearFunction    <CoarseVector> CoarseSolver;
  Aggregates     & _Aggregates;
  FineOperator   & _FineOperator;
  FineSmoother   & _PreSmoother;
  FineSmoother   & _PostSmoother;
  CoarseOperator & _CoarseOperator;
  CoarseSolver   & _CoarseSolve;

  int level;  void Level(int lv) {level = lv; };

  MGPreconditioner(Aggregates &Agg,
                   FineOperator &Fine,
                   FineSmoother &PreSmoother,
                   FineSmoother &PostSmoother,
                   CoarseOperator &CoarseOperator_,
                   CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)  {  }

  virtual void operator()(const FineField &in, FineField & out)
  {
    GridBase *CoarseGrid = _Aggregates.CoarseGrid;
    //    auto CoarseGrid = _CoarseOperator.Grid();
    CoarseVector Csrc(CoarseGrid);
    CoarseVector Csol(CoarseGrid);
    FineField vec1(in.Grid());
    FineField vec2(in.Grid());

    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;

    //    std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
    double t;
    // Fine Smoother
    //    out = in;
    out = Zero();
    t=-usecond();
    _PreSmoother(in,out);
    t+=usecond();

    std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;

    // Update the residual
    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);
    //    std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;

    // Fine to Coarse
    t=-usecond();
    _Aggregates.ProjectToSubspace  (Csrc,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;

    // Coarse correction
    t=-usecond();
    Csol = Zero();
    _CoarseSolve(Csrc,Csol);
    //Csol=Zero();
    t+=usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;

    // Coarse to Fine
    t=-usecond();
    //    _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
    _Aggregates.PromoteFromSubspace(Csol,vec1);
    add(out,out,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;

    // Residual
    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);
    //    std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;

    // Fine Smoother
    t=-usecond();
    //    vec2=vec1;
    vec2=Zero();
    _PostSmoother(vec1,vec2);
    t+=usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;

    add( out,out,vec2);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
};
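// MGPreconditioner implements a two-level V-cycle: pre-smooth, form the
// residual, project it to the coarse subspace, coarse-solve, prolongate and
// add the correction, re-form the residual, post-smooth, and accumulate the
// final update, timing each stage.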
|
||||
|
||||
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=2;
  const int Ls=16;

  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -151,7 +220,8 @@ int main (int argc, char ** argv)
  // Construct a coarsened grid
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/4;
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
@ -173,15 +243,14 @@ int main (int argc, char ** argv)
  FieldMetaData header;
  std::string file("ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  //Umu = 1.0;

  RealD mass=0.5;
  RealD mass=0.01;
  RealD M5=1.8;

  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);

  const int nbasis = 1;
  const int nbasis = 20;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);

@ -193,25 +262,51 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;

  PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM(Ddwf,Dpv);
  HermOpAdaptor<LatticeFermionD> HOA(PVdagM);

  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
  PVdagM_t PVdagM(Ddwf,Dpv);
  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv);  // 355
  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv);  // 246
  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv);  // 183
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 134
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 127 -- NULL space via inverse iteration
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 57 -- NULL space via inverse iteration; 3 iterations
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 56 ...
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 51 ... with 24 vecs
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 31 ... with 24 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 43 ... with 16 vecs and 2^4 blocking, sloppier
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 35 ... with 20 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 35 ... with 20 vecs and 2^4 blocking, looser coarse
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);  // 64 ... with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); //

  // Run power method on HOA??
  PowerMethod<LatticeFermion> PM;   PM(HOA,src);
  //  PowerMethod<LatticeFermion> PM;   PM(PVdagM,src);

  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace AggregatesPD(Coarse5d,FGrid,cb);
  /*
  AggregatesPD.CreateSubspaceChebyshev(RNG5,
                                       HOA,
                                       PVdagM,
                                       nbasis,
                                       5000.0,
                                       0.02,
                                       100,
                                       50,
                                       50,
                                       4000.0,
                                       2.0,
                                       200,
                                       200,
                                       200,
                                       0.0);
  */
  AggregatesPD.CreateSubspaceGCR(RNG5,
                                 PVdagM,
                                 nbasis);

  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
  LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD);
@ -257,6 +352,60 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;

  /**********
   * Some solvers
   **********
   */

  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////

  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10);
  L2PGCR.Level(3);
  c_res=Zero();
  L2PGCR(c_src,c_res);

  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  TrivialPrecon<LatticeFermionD> simple_fine;
  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
  SmootherGCR.Level(2);

  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);

  f_src = one; // 1 in every element for vector 1.
  f_res=Zero();
  SmootherGCR(f_src,f_res);

  typedef MGPreconditioner<vSpinColourVector, vTComplex,nbasis> TwoLevelMG;

  TwoLevelMG TwoLevelPrecon(AggregatesPD,
                            PVdagM,
                            simple_fine,
                            SmootherGCR,
                            LinOpCoarse,
                            L2PGCR);

  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
  L1PGCR.Level(1);

  f_res=Zero();
  L1PGCR(f_src,f_res);

  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
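Assembled, the solver hierarchy in this test reads: L1PGCR is the outer PGCR on PVdagM at 1.0e-8, preconditioned by the two-level cycle, which restricts through the AggregatesPD subspace, solves the coarsened little Dirac operator with L2PGCR at 3.0e-2, prolongates, and post-smooths with the one-iteration SmootherGCR on the shifted operator. One preconditioner application is then, schematically (as far as the visible fragments show, with R/P the aggregation restriction/prolongation, A_c the coarse operator, S the smoother and A = PVdagM):

    M(b) \;=\; P A_c^{-1} R\, b \;+\; S\!\left(b - A\, P A_c^{-1} R\, b\right)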
14 tests/lanczos/LanParams.xml Normal file
@ -0,0 +1,14 @@
<?xml version="1.0"?>
<grid>
  <LanczosParameters>
    <mass>0.00107</mass>
    <M5>1.8</M5>
    <Ls>48</Ls>
    <Nstop>10</Nstop>
    <Nk>15</Nk>
    <Np>85</Np>
    <ChebyLow>0.003</ChebyLow>
    <ChebyHigh>60</ChebyHigh>
    <ChebyOrder>201</ChebyOrder>
  </LanczosParameters>
</grid>
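A note on the Chebyshev entries, which the tests below hand straight to the implicitly restarted Lanczos: the polynomial acts as a spectral filter that stays O(1) on [ChebyLow, ChebyHigh] and grows rapidly below ChebyLow, so the wanted low modes of the (positive) MdagM spectrum are amplified relative to the bulk. With the standard affine map this is (background fact, not part of this diff):

    p(\lambda) \;=\; T_n\!\left(\frac{2\lambda - a - b}{b - a}\right), \qquad a = \text{ChebyLow},\; b = \text{ChebyHigh},\; n = \text{ChebyOrder}.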
346 tests/lanczos/Test_dwf_G5R5.cc Normal file
@ -0,0 +1,346 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_dwf_G5R5.cc

    Copyright (C) 2015

    Author: Chulwoo Jung <chulwoo@bnl.gov>
    From Duo and Bob's Chirality study

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution
    directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;

//typedef WilsonFermionD FermionOp;
typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField;

RealD AllZero(RealD x) { return 0.; }

namespace Grid {

struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  RealD, mass ,
                                  RealD, M5 ,
                                  Integer, Ls,
                                  Integer, Nstop,
                                  Integer, Nk,
                                  Integer, Np,
                                  RealD, ChebyLow,
                                  RealD, ChebyHigh,
                                  Integer, ChebyOrder)
  //  Integer, StartTrajectory,
  //  Integer, Trajectories, /* @brief Number of sweeps in this run */
  //  bool, MetropolisTest,
  //  Integer, NoMetropolisUntil,
  //  std::string, StartingType,
  //  Integer, SW,
  //  RealD, Kappa,
  //  IntegratorParameters, MD)

  LanczosParameters() {
    ////////////////////////////// Default values
    mass = 0;
    //    MetropolisTest    = true;
    //    NoMetropolisUntil = 10;
    //    StartTrajectory   = 0;
    //    SW                = 2;
    //    Trajectories      = 10;
    //    StartingType      = "HotStart";
    /////////////////////////////////
  }

  template <class ReaderClass >
  LanczosParameters(Reader<ReaderClass> & TheReader){
    initialize(TheReader);
  }

  template < class ReaderClass >
  void initialize(Reader<ReaderClass> &TheReader){
    //    std::cout << GridLogMessage << "Reading HMC\n";
    read(TheReader, "HMC", *this);
  }

  void print_parameters() const {
    //    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
    //    MD.print_parameters();
  }

};

}

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);

  LanczosParameters LanParams;
#if 1
  {
    XmlReader HMCrd("LanParams.xml");
    read(HMCrd,"LanczosParameters",LanParams);
  }
#else
  {
    LanParams.mass = mass;
  }
#endif
  std::cout << GridLogMessage<< LanParams <<std::endl;
  {
    XmlWriter HMCwr("LanParams.xml.out");
    write(HMCwr,"LanczosParameters",LanParams);
  }

  int Ls=16;
  RealD M5=1.8;
  RealD mass = -1.0;

  mass=LanParams.mass;
  Ls=LanParams.Ls;
  M5=LanParams.M5;

  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
      GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid =
      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  //  GridCartesian* FGrid = UGrid;
  //  GridRedBlackCartesian* FrbGrid = UrbGrid;
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
  //  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);

  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
  GridParallelRNG RNG5(FGrid);    RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);    RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5rb(FrbGrid);  RNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid);

  FieldMetaData header;
  std::string file("./config");

  int precision32 = 0;
  int tworow = 0;
  NerscIO::readConfiguration(Umu,header,file);

  /*
  std::vector<LatticeColourMatrix> U(4, UGrid);
  for (int mu = 0; mu < Nd; mu++) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }
  */

  int Nstop = 10;
  int Nk = 20;
  int Np = 80;
  Nstop=LanParams.Nstop;
  Nk=LanParams.Nk;
  Np=LanParams.Np;

  int Nm = Nk + Np;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;

  //while ( mass > - 5.0){
  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
  //  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
  Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
  //  Gamma5R5HermitianLinearOperator
  std::vector<double> Coeffs{0, 1.};
  Polynomial<FermionField> PolyX(Coeffs);

  Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);

  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
  PlainHermOp<FermionField> Op     (HermOp);
  PlainHermOp<FermionField> Op2    (G5R5Herm);

  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);

  std::vector<RealD> eval(Nm);
  FermionField src(FGrid);
  gaussian(RNG5, src);
  std::vector<FermionField> evec(Nm, FGrid);
  for (int i = 0; i < 1; i++) {
    std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
              << std::endl;
  };

  int Nconv;
  IRL.calc(eval, evec, src, Nconv);

  std::cout << mass <<" : " << eval << std::endl;

#if 0
  Gamma g5(Gamma::Algebra::Gamma5) ;
  ComplexD dot;
  FermionField tmp(FGrid);
  //  RealD eMe,eMMe;
  for (int i = 0; i < Nstop ; i++) {
    //    tmp = g5*evec[i];
    dot = innerProduct(evec[i],evec[i]);
    //    G5R5(tmp,evec[i]);
    G5R5Herm.HermOpAndNorm(evec[i],tmp,eMe,eMMe);
    std::cout <<"Norm "<<M5<<" "<< mass << " : " << i << " " << real(dot) << " " << imag(dot) << " "<< eMe << " " <<eMMe<< std::endl ;
    for (int j = 0; j < Nstop ; j++) {
      dot = innerProduct(tmp,evec[j]);
      std::cout <<"G5R5 "<<M5<<" "<< mass << " : " << i << " " <<j<<" " << real(dot) << " " << imag(dot) << std::endl ;
    }
  }
  //  src = evec[0]+evec[1]+evec[2];
  //  mass += -0.1;
#endif

  //**********************************************************************
  //orthogonalization
  //calculate the matrix
  cout << "Start orthogonalization " << endl;
  cout << "calculate the matrix element" << endl;
  vector<LatticeFermion> G5R5Mevec(Nconv, FGrid);
  vector<LatticeFermion> finalevec(Nconv, FGrid);
  vector<RealD> eMe(Nconv), eMMe(Nconv);
  for(int i = 0; i < Nconv; i++){
    G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
  }
  cout << "Re<evec, G5R5M(evec)>: " << endl;
  cout << eMe << endl;
  cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
  cout << eMMe << endl;
  vector<vector<ComplexD>> VevecG5R5Mevec(Nconv);
  Eigen::MatrixXcd evecG5R5Mevec = Eigen::MatrixXcd::Zero(Nconv, Nconv);
  for(int i = 0; i < Nconv; i++){
    VevecG5R5Mevec[i].resize(Nconv);
    for(int j = 0; j < Nconv; j++){
      VevecG5R5Mevec[i][j] = innerProduct(evec[i], G5R5Mevec[j]);
      evecG5R5Mevec(i, j) = VevecG5R5Mevec[i][j];
    }
  }
  //calculate eigenvector
  cout << "Eigen solver" << endl;
  Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(evecG5R5Mevec);
  vector<RealD> eigeneval(Nconv);
  vector<vector<ComplexD>> eigenevec(Nconv);
  for(int i = 0; i < Nconv; i++){
    eigeneval[i] = eigensolver.eigenvalues()[i];
    eigenevec[i].resize(Nconv);
    for(int j = 0; j < Nconv; j++){
      eigenevec[i][j] = eigensolver.eigenvectors()(i, j);
    }
  }
  //rotation
  cout << "Do rotation" << endl;
  for(int i = 0; i < Nconv; i++){
    finalevec[i] = finalevec[i] - finalevec[i];
    for(int j = 0; j < Nconv; j++){
      finalevec[i] = eigenevec[j][i]*evec[j] + finalevec[i];
    }
  }
  //normalize again;
  for(int i = 0; i < Nconv; i++){
    RealD tmp_RealD = norm2(finalevec[i]);
    tmp_RealD = 1./pow(tmp_RealD, 0.5);
    finalevec[i] = finalevec[i]*tmp_RealD;
  }

  //check
  for(int i = 0; i < Nconv; i++){
    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
  }

  //**********************************************************************
  //sort the eigenvectors
  vector<LatticeFermion> finalevec_copy(Nconv, FGrid);
  for(int i = 0; i < Nconv; i++){
    finalevec_copy[i] = finalevec[i];
  }
  vector<RealD> eMe_copy(eMe);
  for(int i = 0; i < Nconv; i++){
    eMe[i] = fabs(eMe[i]);
    eMe_copy[i] = eMe[i];
  }
  sort(eMe_copy.begin(), eMe_copy.end());
  for(int i = 0; i < Nconv; i++){
    for(int j = 0; j < Nconv; j++){
      if(eMe[j] == eMe_copy[i]){
        finalevec[i] = finalevec_copy[j];
      }
    }
  }
  for(int i = 0; i < Nconv; i++){
    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
  }
  cout << "Re<evec, G5R5M(evec)>: " << endl;
  cout << eMe << endl;
  cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
  cout << eMMe << endl;

  //  vector<LatticeFermion> finalevec(Nconv, FGrid);
  // temporary, until doing rotation
  //  for(int i = 0; i < Nconv; i++)
  //    finalevec[i]=evec[i];
  //**********************************************************************
  //calculate chirality matrix
  vector<LatticeFermion> G5evec(Nconv, FGrid);
  vector<vector<ComplexD>> chiral_matrix(Nconv);
  vector<vector<RealD>> chiral_matrix_real(Nconv);
  for(int i = 0; i < Nconv; i++){
    //    G5evec[i] = G5evec[i] - G5evec[i];
    G5evec[i] = Zero();
    for(int j = 0; j < Ls/2; j++){
      axpby_ssp(G5evec[i], 1., finalevec[i], 0., G5evec[i], j, j);
    }
    for(int j = Ls/2; j < Ls; j++){
      axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
    }
  }
  for(int i = 0; i < Nconv; i++){
    chiral_matrix_real[i].resize(Nconv);
    chiral_matrix[i].resize(Nconv);
    for(int j = 0; j < Nconv; j++){
      chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
      chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
      std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
    }
  }
  for(int i = 0; i < Nconv; i++){
    if(chiral_matrix[i][i].real() < 0.){
      chiral_matrix_real[i][i] = -1. * chiral_matrix_real[i][i];
    }
  }

  Grid_finalize();
}
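Two pieces of the test above benefit from being spelled out. The orthogonalisation/rotation block is a Rayleigh-Ritz pass over the converged Lanczos vectors: it forms the small matrix of the Gamma5R5-Hermitian operator in that basis, diagonalises it with Eigen, and rotates onto the Ritz vectors (a reading of the code, with e_i the Lanczos vectors and psi_i the finalevec):

    H_{ij} = \langle e_i,\; \Gamma_5 R_5 M\, e_j \rangle, \qquad H = V \Lambda V^\dagger, \qquad \psi_i = \sum_j V_{ji}\, e_j .

The chirality block then measures C_{ij} = \langle \psi_i, \hat\Gamma \psi_j \rangle, where the axpby_ssp loops build \hat\Gamma\psi by keeping the s < L_s/2 slices with weight +1 and the s \ge L_s/2 slices with weight -1.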
278 tests/lanczos/Test_wilson_DWFKernel.cc Normal file
@ -0,0 +1,278 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_dwf_lanczos.cc

    Copyright (C) 2015

    Author: Chulwoo Jung <chulwoo@bnl.gov>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution
    directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;

typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;

RealD AllZero(RealD x) { return 0.; }

namespace Grid {

#if 0
template<typename Field>
class RationalHermOp : public LinearFunction<Field> {
public:
  using LinearFunction<Field>::operator();
  //  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
  RealD _massDen, _massNum;

  FunctionHermOp(LinearOperatorBase<Field>& linop, RealD massDen,RealD massNum)
    : _Linop(linop) ,_massDen(massDen),_massNum(massNum) {};

  void operator()(const Field& in, Field& out) {
    //    _poly(_Linop,in,out);
  }
};
#endif

template<class Matrix,class Field>
class InvG5LinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  RealD _num;
  RealD _Tol;
  Integer _MaxIt;
  Gamma g5;

public:
  InvG5LinearOperator(Matrix &Mat,RealD num): _Mat(Mat),_num(num), _Tol(1e-12),_MaxIt(10000), g5(Gamma::Algebra::Gamma5) {};

  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {
    assert(0);
    _Mat.Mdiag(in,out);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    assert(0);
    _Mat.Mdir(in,out,dir,disp);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
    _Mat.MdirAll(in,out);
  };
  void Op     (const Field &in, Field &out){
    assert(0);
    _Mat.M(in,out);
  }
  void AdjOp     (const Field &in, Field &out){
    assert(0);
    _Mat.Mdag(in,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
  void HermOp(const Field &in, Field &out){
    Field tmp(in.Grid());
    MdagMLinearOperator<Matrix,Field> denom(_Mat);
    ConjugateGradient<Field> CG(_Tol,_MaxIt);
    _Mat.M(in,tmp);
    tmp += _num*in;
    _Mat.Mdag(tmp,out);
    CG(denom,out,tmp);
    out = g5*tmp;
  }
};

struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  RealD, mass ,
                                  RealD, resid,
                                  RealD, ChebyLow,
                                  RealD, ChebyHigh,
                                  Integer, ChebyOrder)
  //  Integer, StartTrajectory,
  //  Integer, Trajectories, /* @brief Number of sweeps in this run */
  //  bool, MetropolisTest,
  //  Integer, NoMetropolisUntil,
  //  std::string, StartingType,
  //  Integer, SW,
  //  RealD, Kappa,
  //  IntegratorParameters, MD)

  LanczosParameters() {
    ////////////////////////////// Default values
    mass = 0;
    //    MetropolisTest    = true;
    //    NoMetropolisUntil = 10;
    //    StartTrajectory   = 0;
    //    SW                = 2;
    //    Trajectories      = 10;
    //    StartingType      = "HotStart";
    /////////////////////////////////
  }

  template <class ReaderClass >
  LanczosParameters(Reader<ReaderClass> & TheReader){
    initialize(TheReader);
  }

  template < class ReaderClass >
  void initialize(Reader<ReaderClass> &TheReader){
    //    std::cout << GridLogMessage << "Reading HMC\n";
    read(TheReader, "HMC", *this);
  }

  void print_parameters() const {
    //    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
    //    MD.print_parameters();
  }

};

}

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);

  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
      GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid =
      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian* FGrid = UGrid;
  GridRedBlackCartesian* FrbGrid = UrbGrid;
  //  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);

  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
  GridParallelRNG RNG5(FGrid);
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5rb(FrbGrid);
  RNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid);
  //  SU<Nc>::HotConfiguration(RNG4, Umu);

  FieldMetaData header;
  std::string file("./config");

  int precision32 = 0;
  int tworow = 0;
  //  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
  NerscIO::readConfiguration(Umu,header,file);

  /*
  std::vector<LatticeColourMatrix> U(4, UGrid);
  for (int mu = 0; mu < Nd; mu++) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }
  */

  int Nstop = 5;
  int Nk = 10;
  int Np = 90;
  int Nm = Nk + Np;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;

  RealD mass = -1.0;

  LanczosParameters LanParams;
#if 1
  {
    XmlReader HMCrd("LanParams.xml");
    read(HMCrd,"LanczosParameters",LanParams);
  }
#else
  {
    LanParams.mass = mass;
  }
#endif
  std::cout << GridLogMessage<< LanParams <<std::endl;
  {
    XmlWriter HMCwr("LanParams.xml.out");
    write(HMCwr,"LanczosParameters",LanParams);
  }

  mass=LanParams.mass;
  resid=LanParams.resid;

  while ( mass > - 5.0){
    FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
    InvG5LinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator,-2.); /// <-----
    //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
    //  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----

    std::vector<double> Coeffs{0, 0, 1.};
    Polynomial<FermionField> PolyX(Coeffs);
    Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);

    FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
    //  InvHermOp<FermionField> Op(WilsonOperator,HermOp);
    PlainHermOp<FermionField> Op     (HermOp);
    //  PlainHermOp<FermionField> Op2    (HermOp2);

    ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);

    std::vector<RealD> eval(Nm);
    FermionField src(FGrid);
    gaussian(RNG5, src);
    std::vector<FermionField> evec(Nm, FGrid);
    for (int i = 0; i < 1; i++) {
      std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
                << std::endl;
    };

    int Nconv;
    IRL.calc(eval, evec, src, Nconv);

    std::cout << mass <<" : " << eval << std::endl;

    Gamma g5(Gamma::Algebra::Gamma5) ;
    ComplexD dot;
    FermionField tmp(FGrid);
    for (int i = 0; i < Nstop ; i++) {
      tmp = g5*evec[i];
      dot = innerProduct(tmp,evec[i]);
      std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
    }
    src = evec[0]+evec[1]+evec[2];
    mass += -0.1;
  }

  Grid_finalize();
}
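A note on InvG5LinearOperator::HermOp above, which is what the Lanczos actually iterates: with M the Wilson operator at mass 2+m and c = -2 the stored shift, the code forms t = (M + c) psi, applies M-dagger, CG-solves against M-dagger-M, and multiplies by gamma5. Assuming Grid's ConjugateGradient takes (operator, source, solution) in that order, the net kernel is

    \mathrm{HermOp}\,\psi \;=\; \gamma_5\, M^{-1}(M + c)\,\psi \;=\; \gamma_5\left(1 + c\, M^{-1}\right)\psi,

an inverse-iteration style operator that enhances the modes of M closest to zero (a sketch of the intent read off the code, not a documented statement).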
211 tests/lanczos/Test_wilson_specflow.cc Normal file
@ -0,0 +1,211 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_dwf_lanczos.cc

    Copyright (C) 2015

    Author: Chulwoo Jung <chulwoo@bnl.gov>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution
    directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;

typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;

RealD AllZero(RealD x) { return 0.; }

namespace Grid {

struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  RealD, mass ,
                                  RealD, ChebyLow,
                                  RealD, ChebyHigh,
                                  Integer, ChebyOrder)
  //  Integer, StartTrajectory,
  //  Integer, Trajectories, /* @brief Number of sweeps in this run */
  //  bool, MetropolisTest,
  //  Integer, NoMetropolisUntil,
  //  std::string, StartingType,
  //  Integer, SW,
  //  RealD, Kappa,
  //  IntegratorParameters, MD)

  LanczosParameters() {
    ////////////////////////////// Default values
    mass = 0;
    //    MetropolisTest    = true;
    //    NoMetropolisUntil = 10;
    //    StartTrajectory   = 0;
    //    SW                = 2;
    //    Trajectories      = 10;
    //    StartingType      = "HotStart";
    /////////////////////////////////
  }

  template <class ReaderClass >
  LanczosParameters(Reader<ReaderClass> & TheReader){
    initialize(TheReader);
  }

  template < class ReaderClass >
  void initialize(Reader<ReaderClass> &TheReader){
    //    std::cout << GridLogMessage << "Reading HMC\n";
    read(TheReader, "HMC", *this);
  }

  void print_parameters() const {
    //    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
    //    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
    //    MD.print_parameters();
  }

};

}

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);

  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
      GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid =
      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian* FGrid = UGrid;
  GridRedBlackCartesian* FrbGrid = UrbGrid;
  //  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);

  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
  GridParallelRNG RNG5(FGrid);
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5rb(FrbGrid);
  RNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid);
  //  SU<Nc>::HotConfiguration(RNG4, Umu);

  FieldMetaData header;
  std::string file("./config");

  int precision32 = 0;
  int tworow = 0;
  //  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
  NerscIO::readConfiguration(Umu,header,file);

  /*
  std::vector<LatticeColourMatrix> U(4, UGrid);
  for (int mu = 0; mu < Nd; mu++) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }
  */

  int Nstop = 10;
  int Nk = 20;
  int Np = 80;
  int Nm = Nk + Np;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;

  RealD mass = -1.0;

  LanczosParameters LanParams;
#if 1
  {
    XmlReader HMCrd("LanParams.xml");
    read(HMCrd,"LanczosParameters",LanParams);
  }
#else
  {
    LanParams.mass = mass;
  }
#endif
  std::cout << GridLogMessage<< LanParams <<std::endl;
  {
    XmlWriter HMCwr("LanParams.xml.out");
    write(HMCwr,"LanczosParameters",LanParams);
  }

  mass=LanParams.mass;

  while ( mass > - 5.0){
    FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
    MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
    //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
    Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----

    std::vector<double> Coeffs{0, 1.};
    Polynomial<FermionField> PolyX(Coeffs);
    //  Chebyshev<FermionField> Cheby(0.5, 60., 31);
    //  RealD, ChebyLow,
    //  RealD, ChebyHigh,
    //  Integer, ChebyOrder)

    Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);

    FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
    PlainHermOp<FermionField> Op     (HermOp);
    PlainHermOp<FermionField> Op2    (HermOp2);

    ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);

    std::vector<RealD> eval(Nm);
    FermionField src(FGrid);
    gaussian(RNG5, src);
    std::vector<FermionField> evec(Nm, FGrid);
    for (int i = 0; i < 1; i++) {
      std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
                << std::endl;
    };

    int Nconv;
    IRL.calc(eval, evec, src, Nconv);

    std::cout << mass <<" : " << eval << std::endl;

    Gamma g5(Gamma::Algebra::Gamma5) ;
    ComplexD dot;
    FermionField tmp(FGrid);
    for (int i = 0; i < Nstop ; i++) {
      tmp = g5*evec[i];
      dot = innerProduct(tmp,evec[i]);
      std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
    }
    src = evec[0]+evec[1]+evec[2];
    mass += -0.1;
  }

  Grid_finalize();
}
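Test_wilson_specflow drives the same IRL machinery through Gamma5HermitianLinearOperator and walks the Wilson mass from -1.0 downwards in steps of 0.1 until -5.0, i.e. it traces the spectral flow of the Hermitian Wilson operator

    H_W(m) = \gamma_5\, D_W(m),

printing at each mass the low eigenvalues together with the chirality \langle \gamma_5 \psi_i, \psi_i \rangle. Eigenvalue crossings of H_W(m) through zero as m varies are the standard spectral-flow measure of topological charge (background statement, not asserted by this diff).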
@ -33,8 +33,7 @@ namespace Grid{
  GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters,
                                  int, steps,
                                  double, step_size,
                                  int, meas_interval,
                                  double, maxTau); // for the adaptive algorithm
                                  int, meas_interval);

  template <class ReaderClass >
@ -86,7 +85,7 @@ int main(int argc, char **argv) {
  WFParameters WFPar(Reader);
  ConfParameters CPar(Reader);
  CheckpointerParameters CPPar(CPar.conf_prefix, CPar.rng_prefix);
  BinaryHmcCheckpointer<PeriodicGimplR> CPBin(CPPar);
  NerscHmcCheckpointer<PeriodicGimplR> CPBin(CPPar);

  for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){

@ -96,19 +95,13 @@ int main(int argc, char **argv) {
  std::cout << GridLogMessage << "Initial plaquette: "
            << WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;

  int t=WFPar.maxTau;
  WilsonFlowAdaptive<PeriodicGimplR> WF(WFPar.step_size, WFPar.maxTau,
                                        1.0e-4,
  WilsonFlow<PeriodicGimplR> WF(WFPar.step_size, WFPar.steps,
                                WFPar.meas_interval);

  WF.smear(Uflow, Umu);

  RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
  RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
  RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow);
  std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
  std::cout << GridLogMessage << "T0                "<< conf << " " << WFlow_T0 << std::endl;
  std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;

  std::cout<< GridLogMessage << " Admissibility check:\n";
  const double sp_adm = 0.067; // admissible threshold
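For orientation on this hunk: it trades the adaptive-step WilsonFlowAdaptive for the fixed-step WilsonFlow, which integrates the gradient flow to \tau = \text{steps} \times \text{step\_size}. The WFlow_T0 line reports the plaquette-derived energy density at flow time t; in the usual scale-setting convention one monitors (background fact, not part of the change)

    t^2 \langle E(t) \rangle, \qquad t^2 \langle E \rangle \big|_{t=t_0} = 0.3 .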
37 visualisation/CMakeLists.txt Normal file
@ -0,0 +1,37 @@
cmake_minimum_required(VERSION 3.12 FATAL_ERROR)

project(GridViewer)

list(APPEND CMAKE_PREFIX_PATH "/Users/peterboyle/QCD/vtk/VTK-9.4.2-install/")

find_package(VTK COMPONENTS
  CommonColor
  CommonCore
  FiltersCore
  FiltersModeling
  IOImage
  IOFFMPEG
  InteractionStyle
  InteractionWidgets
  RenderingContextOpenGL2
  RenderingCore
  RenderingFreeType
  RenderingGL2PSOpenGL2
  RenderingOpenGL2
)

if (NOT VTK_FOUND)
  message(FATAL_ERROR "GridViewer: Unable to find the VTK build folder.")
endif()

# Prevent a "command line is too long" failure in Windows.
set(CMAKE_NINJA_FORCE_RESPONSE_FILE "ON" CACHE BOOL "Force Ninja to use response files.")

add_executable(FieldDensityAnimate MACOSX_BUNDLE FieldDensityAnimate.cxx )
target_link_libraries(FieldDensityAnimate PRIVATE ${VTK_LIBRARIES}
)
# vtk_module_autoinit is needed
vtk_module_autoinit(
  TARGETS FieldDensityAnimate
  MODULES ${VTK_LIBRARIES}
)
BIN visualisation/E8_vs_Topo8.avi Normal file
Binary file not shown.
285 visualisation/FieldDensity.py Normal file
@ -0,0 +1,285 @@
#!/usr/bin/env python

# noinspection PyUnresolvedReferences
import math
import vtk
import vtkmodules.vtkInteractionStyle
# noinspection PyUnresolvedReferences
import vtkmodules.vtkRenderingOpenGL2
from vtkmodules.vtkCommonColor import vtkNamedColors
from vtkmodules.vtkCommonCore import (
    VTK_VERSION_NUMBER,
    vtkVersion
)
from vtkmodules.vtkCommonCore import VTK_DOUBLE
from vtkmodules.vtkCommonDataModel import vtkImageData
from vtkmodules.vtkFiltersCore import (
    vtkMarchingCubes,
    vtkStripper
)
from vtkmodules.vtkFiltersModeling import vtkOutlineFilter
from vtkmodules.vtkIOImage import (
    vtkMetaImageReader,
    vtkJPEGWriter,
    vtkPNGWriter
)
from vtkmodules.vtkRenderingCore import (
    vtkActor,
    vtkCamera,
    vtkPolyDataMapper,
    vtkProperty,
    vtkRenderWindow,
    vtkRenderWindowInteractor,
    vtkRenderer,
    vtkWindowToImageFilter
)


class vtkTimerCallback():
    def __init__(self, steps, imageData, iren):
        self.timer_count = 0
        self.steps = steps
        self.imageData = imageData
        self.iren = iren
        self.timerId = None
        self.step = 0

    def execute(self, obj, event):

        print(self.timer_count)

        dims = self.imageData.GetDimensions()

        t = self.step/10.0

        z0 = 2
        y0 = 4
        x0 = 4
        z1 = 14
        y1 = 12
        x1 = 12
        for z in range(dims[2]):
            for y in range(dims[1]):
                for x in range(dims[0]):
                    self.imageData.SetScalarComponentFromDouble(x, y, z, 0,
                        math.sin(t)*math.exp(-0.25*((x-x0)*(x-x0)+(y-y0)*(y-y0)+(z-z0)*(z-z0)))
                        - math.cos(t)*math.exp(-0.25*((x-x1)*(x-x1)+(y-y1)*(y-y1)+(z-z1)*(z-z1))))

        self.imageData.Modified()

        iren = obj
        iren.GetRenderWindow().Render()
        self.timer_count += 1
        self.step += 1

        if self.step >= self.steps:
            iren.DestroyTimer(self.timerId)


def WriteImage(fileName, renWin):
    '''
    '''

    import os

    if fileName:
        # Select the writer to use.
        path, ext = os.path.splitext(fileName)
        ext = ext.lower()
        if not ext:
            ext = '.png'
            fileName = fileName + ext
        elif ext == '.jpg':
            writer = vtkJPEGWriter()
        else:
            writer = vtkPNGWriter()

        windowto_image_filter = vtkWindowToImageFilter()
        windowto_image_filter.SetInput(renWin)
        windowto_image_filter.SetScale(1)  # image quality
        windowto_image_filter.SetInputBufferTypeToRGBA()

        writer.SetFileName(fileName)
        writer.SetInputConnection(windowto_image_filter.GetOutputPort())
        writer.Write()
    else:
        raise RuntimeError('Need a filename.')


def main():

    colors = vtkNamedColors()

    file_name = get_program_parameters()

    colors.SetColor('InstantonColor', [240, 184, 160, 255])
    colors.SetColor('BackfaceColor', [255, 229, 200, 255])
    colors.SetColor('BkgColor', [51, 77, 102, 255])

    # Create the renderer, the render window, and the interactor. The renderer
    # draws into the render window, the interactor enables mouse- and
    # keyboard-based interaction with the data within the render window.
    #
    a_renderer = vtkRenderer()
    ren_win = vtkRenderWindow()
    ren_win.AddRenderer(a_renderer)

    iren = vtkRenderWindowInteractor()
    iren.SetRenderWindow(ren_win)

    # The following reader is used to read a series of 2D slices (images)
    # that compose the volume. The slice dimensions are set, and the
    # pixel spacing. The data Endianness must also be specified. The reader
    # uses the FilePrefix in combination with the slice number to construct
    # filenames using the format FilePrefix.%d. (In this case the FilePrefix
    # is the root name of the file: quarter.)
    imageData = vtkImageData()
    imageData.SetDimensions(16, 16, 16)
    imageData.AllocateScalars(VTK_DOUBLE, 1)

    dims = imageData.GetDimensions()

    # Fill every entry of the image data with '2.0'
    # Set the input data
    for z in range(dims[2]):
        z0 = dims[2]/2
        for y in range(dims[1]):
            y0 = dims[1]/2
            for x in range(dims[0]):
                x0 = dims[0]/2
                imageData.SetScalarComponentFromDouble(x, y, z, 0, math.exp(-0.25*((x-x0)*(x-x0)+(y-y0)*(y-y0)+z*z)) - math.exp(-0.25*((x-x0)*(x-x0)+y*y+(z-z0)*(z-z0))))

    instanton_extractor = vtkMarchingCubes()
    instanton_extractor.SetInputData(imageData)
    instanton_extractor.SetValue(0, 0.1)

    instanton_stripper = vtkStripper()
    instanton_stripper.SetInputConnection(instanton_extractor.GetOutputPort())

    instanton_mapper = vtkPolyDataMapper()
    instanton_mapper.SetInputConnection(instanton_stripper.GetOutputPort())
    instanton_mapper.ScalarVisibilityOff()

    instanton = vtkActor()
    instanton.SetMapper(instanton_mapper)
    instanton.GetProperty().SetDiffuseColor(colors.GetColor3d('InstantonColor'))
    instanton.GetProperty().SetSpecular(0.3)
    instanton.GetProperty().SetSpecularPower(20)
    instanton.GetProperty().SetOpacity(0.5)

    # The triangle stripper is used to create triangle strips from the
    # isosurface; these render much faster on many systems.
    antiinstanton_extractor = vtkMarchingCubes()
    antiinstanton_extractor.SetInputData(imageData)
    antiinstanton_extractor.SetValue(0, -0.1)

    antiinstanton_stripper = vtkStripper()
    antiinstanton_stripper.SetInputConnection(antiinstanton_extractor.GetOutputPort())

    antiinstanton_mapper = vtkPolyDataMapper()
    antiinstanton_mapper.SetInputConnection(antiinstanton_stripper.GetOutputPort())
    antiinstanton_mapper.ScalarVisibilityOff()

    antiinstanton = vtkActor()
    antiinstanton.SetMapper(antiinstanton_mapper)
    antiinstanton.GetProperty().SetDiffuseColor(colors.GetColor3d('Ivory'))

    # An outline provides a box around the data.
    outline_data = vtkOutlineFilter()
    outline_data.SetInputData(imageData)

    map_outline = vtkPolyDataMapper()
    map_outline.SetInputConnection(outline_data.GetOutputPort())

    outline = vtkActor()
    outline.SetMapper(map_outline)
    outline.GetProperty().SetColor(colors.GetColor3d('Black'))

    # It is convenient to create an initial view of the data. The FocalPoint
    # and Position form a vector direction. Later on (ResetCamera() method)
    # this vector is used to position the camera to look at the data in
    # this direction.
    a_camera = vtkCamera()
    a_camera.SetViewUp(0, 0, -1)
    a_camera.SetPosition(0, -100, 0)
    a_camera.SetFocalPoint(0, 0, 0)
    a_camera.ComputeViewPlaneNormal()
    a_camera.Azimuth(30.0)
    a_camera.Elevation(30.0)

    # Actors are added to the renderer. An initial camera view is created.
    # The Dolly() method moves the camera towards the FocalPoint,
    # thereby enlarging the image.
    a_renderer.AddActor(outline)
    a_renderer.AddActor(instanton)
    a_renderer.AddActor(antiinstanton)
    a_renderer.SetActiveCamera(a_camera)
    a_renderer.ResetCamera()
    a_camera.Dolly(1.0)

    # Set a background color for the renderer and set the size of the
    # render window (expressed in pixels).
    a_renderer.SetBackground(colors.GetColor3d('BkgColor'))
    ren_win.SetSize(1024, 1024)
    ren_win.SetWindowName('ExpoDemo')

    # Note that when camera movement occurs (as it does in the Dolly()
    # method), the clipping planes often need adjusting. Clipping planes
    # consist of two planes: near and far along the view direction. The
    # near plane clips out objects in front of the plane; the far plane
    # clips out objects behind the plane. This way only what is drawn
    # between the planes is actually rendered.
    a_renderer.ResetCameraClippingRange()

    # write image
    #    WriteImage('exp.jpg',ren_win)

    # Sign up to receive TimerEvent
    cb = vtkTimerCallback(200, imageData, iren)
    iren.AddObserver('TimerEvent', cb.execute)
    cb.timerId = iren.CreateRepeatingTimer(50)

    # start the interaction and timer
    ren_win.Render()

    # Initialize the event loop and then start it.
    iren.Initialize()
    iren.Start()


def get_program_parameters():
    import argparse
    description = 'Simple lattice volumetric demo'
    epilogue = '''
    Derived from VTK/Examples/Cxx/Medical2.cxx
    '''
    parser = argparse.ArgumentParser(description=description, epilog=epilogue,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('filename', help='FieldDensity.py')
    args = parser.parse_args()
    return args.filename


def vtk_version_ok(major, minor, build):
    """
    Check the VTK version.

    :param major: Major version.
    :param minor: Minor version.
    :param build: Build version.
    :return: True if the requested VTK version is greater or equal to the actual VTK version.
    """
    needed_version = 10000000000 * int(major) + 100000000 * int(minor) + int(build)
    try:
        vtk_version_number = VTK_VERSION_NUMBER
    except AttributeError:  # as error:
        ver = vtkVersion()
        vtk_version_number = 10000000000 * ver.GetVTKMajorVersion() + 100000000 * ver.GetVTKMinorVersion() \
                             + ver.GetVTKBuildVersion()
    if vtk_version_number >= needed_version:
        return True
    else:
        return False


if __name__ == '__main__':
    main()
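The timer callback animates a superposition of two Gaussian lumps of opposite sign, a toy instanton/anti-instanton pair: at frame time t = step/10 each voxel is set to

    f(\vec x, t) \;=\; \sin t \; e^{-\frac{1}{4}|\vec x - \vec x_0|^2} \;-\; \cos t \; e^{-\frac{1}{4}|\vec x - \vec x_1|^2},

and the fixed +/-0.1 isosurfaces extracted by marching cubes track the two lumps as they oscillate.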
490 visualisation/FieldDensityAnimate.cxx Normal file
@ -0,0 +1,490 @@
// Derived from VTK/Examples/Cxx/Medical2.cxx
// The example reads a volume dataset, extracts two isosurfaces that
// represent the skin and bone, and then displays them.
//
// Modified heavily by Peter Boyle to display lattice field theory data as movies and compare multiple files

#include <vtkActor.h>
#include <vtkCamera.h>
#include <vtkMetaImageReader.h>
#include <vtkNamedColors.h>
#include <vtkNew.h>
#include <vtkOutlineFilter.h>
#include <vtkPolyDataMapper.h>
#include <vtkProperty.h>
#include <vtkRenderWindow.h>
#include <vtkRenderWindowInteractor.h>
#include <vtkRenderer.h>
#include <vtkStripper.h>
#include <vtkImageData.h>
#include <vtkVersion.h>
#include <vtkCallbackCommand.h>
#include <vtkTextActor.h>
#include <vtkTextProperty.h>

#define MPEG
#ifdef MPEG
#include <vtkFFMPEGWriter.h>
#endif

#include <vtkProperty2D.h>
#include <vtkSliderWidget.h>
#include <vtkSliderRepresentation2D.h>
#include <vtkWindowToImageFilter.h>

#include <array>
#include <string>

#include <Grid/Grid.h>

#define USE_FLYING_EDGES
#ifdef USE_FLYING_EDGES
#include <vtkFlyingEdges3D.h>
typedef vtkFlyingEdges3D isosurface;
#else
#include <vtkMarchingCubes.h>
typedef vtkMarchingCubes isosurface;
#endif

int mpeg = 0 ;
int xlate = 0 ;

template <class T> void readFile(T& out, std::string const fname){
#ifdef HAVE_LIME
  Grid::emptyUserRecord record;
  Grid::ScidacReader RD;
  RD.open(fname);
  RD.readScidacFieldRecord(out,record);
  RD.close();
#endif
}
using namespace Grid;

class FrameUpdater : public vtkCallbackCommand
{
public:

  FrameUpdater() {
    TimerCount = 0;
    xoff = 0;
    t = 0;
    imageData = nullptr;
    grid_data = nullptr;
    timerId = 0;
    maxCount = -1;
  }

  static FrameUpdater* New()
  {
    FrameUpdater* cb = new FrameUpdater;
    cb->TimerCount = 0;
    return cb;
  }

  virtual void Execute(vtkObject* caller, unsigned long eventId,void* vtkNotUsed(callData))
  {
    const int max=256;
    char text_string[max];

    if (this->TimerCount < this->maxCount) {

      if (vtkCommand::TimerEvent == eventId)
      {
        ++this->TimerCount;

        // Make a new frame
        auto latt_size = grid_data->Grid()->GlobalDimensions();
        for(int xx=0;xx<latt_size[0];xx++){
        for(int yy=0;yy<latt_size[1];yy++){
        for(int zz=0;zz<latt_size[2];zz++){
          int x = (xx+xoff)%latt_size[0];
          Coordinate site({x,yy,zz,t});
          RealD value = real(peekSite(*grid_data,site));
          imageData->SetScalarComponentFromDouble(xx,yy,zz,0,value);
        }}}

        if ( xlate ) {
          xoff = (xoff + 1)%latt_size[0];
          if ( xoff== 0 ) t = (t+1)%latt_size[3];
        } else {
          t = (t+1)%latt_size[3];
          if ( t== 0 ) xoff = (xoff + 1)%latt_size[0];
        }

        snprintf(text_string,max,"T=%d",t);
        text->SetInput(text_string);

        std::cout << this->TimerCount<<"/"<<maxCount<< " xoff "<<xoff<<" t "<<t <<std::endl;
        imageData->Modified();

        vtkRenderWindowInteractor* iren = dynamic_cast<vtkRenderWindowInteractor*>(caller);
        iren->GetRenderWindow()->Render();

      }
    }

    if (this->TimerCount >= this->maxCount) {
      vtkRenderWindowInteractor* iren = dynamic_cast<vtkRenderWindowInteractor*>(caller);
      if (this->timerId > -1)
      {
        iren->DestroyTimer(this->timerId);
      }
    }
  }


private:
  int TimerCount;
  int xoff;
  int t;
public:
  Grid::LatticeComplexD * grid_data;
  vtkImageData* imageData = nullptr;
  vtkTextActor* text = nullptr;
  vtkFFMPEGWriter *writer = nullptr;
  int timerId ;
  int maxCount ;
  double rms;
  isosurface * posExtractor;
  isosurface * negExtractor;
};
class SliderCallback : public vtkCommand
{
public:
  static SliderCallback* New()
  {
    return new SliderCallback;
  }
  virtual void Execute(vtkObject* caller, unsigned long eventId, void* callData)
  {
    vtkSliderWidget *sliderWidget = vtkSliderWidget::SafeDownCast(caller);
    if (sliderWidget)
    {
      contour = ((vtkSliderRepresentation *)sliderWidget->GetRepresentation())->GetValue();
    }
    for(int i=0;i<fu_list.size();i++){
      fu_list[i]->posExtractor->SetValue(0, SliderCallback::contour*fu_list[i]->rms);
      fu_list[i]->negExtractor->SetValue(0, -SliderCallback::contour*fu_list[i]->rms);
      fu_list[i]->posExtractor->Modified();
      fu_list[i]->negExtractor->Modified();
    }
  }
public:
  static double contour;
  std::vector<FrameUpdater *> fu_list;
};


double SliderCallback::contour;
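The slider rescales both isosurface levels for every loaded file relative to that field's own RMS, so one slider position means the same statistical level across datasets:

    \text{contour}_f \;=\; c \cdot \mathrm{rms}_f, \qquad \mathrm{rms}_f = \sqrt{\tfrac{1}{V} \sum_x |\phi_f(x)|^2},

with c the slider value; main() below computes rms_f exactly this way (norm2 over the global site count) and seeds the extractors at +/-1 RMS by default.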
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace Grid;
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
GridLogLayout();
|
||||
|
||||
auto latt_size = GridDefaultLatt();
|
||||
auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
|
||||
auto mpi_layout = GridDefaultMpi();
|
||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
||||
|
||||
|
||||
std::cout << argc << " command Line arguments "<<std::endl;
|
||||
for(int c=0;c<argc;c++) {
|
||||
std::cout << " - "<<argv[c]<<std::endl;
|
||||
}
|
||||
|
||||
std::vector<std::string> file_list;
|
||||
double default_contour = 1.0;
|
||||
std::string arg;
|
||||
#ifdef MPEG
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--mpeg") ){
|
||||
mpeg = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--xlate") ){
|
||||
xlate = 1;
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--isosurface") ){
|
||||
arg=GridCmdOptionPayload(argv,argv+argc,"--isosurface");
|
||||
GridCmdOptionFloat(arg,default_contour);
|
||||
}
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--file1") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--file1");
|
||||
file_list.push_back(arg);
|
||||
}
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--file2") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--file2");
|
||||
file_list.push_back(arg);
|
||||
}
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--file3") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--file3");
|
||||
file_list.push_back(arg);
|
||||
}
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--file4") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--file4");
|
||||
file_list.push_back(arg);
|
||||
}
|
||||
for(int c=0;c<file_list.size();c++) {
|
||||
std::cout << " file: "<<file_list[c]<<std::endl;
|
||||
}
|
||||
|
||||
// Common things:
|
||||
vtkNew<vtkNamedColors> colors;
|
||||
std::array<unsigned char, 4> posColor{{240, 184, 160, 255}}; colors->SetColor("posColor", posColor.data());
|
||||
std::array<unsigned char, 4> bkg{{51, 77, 102, 255}}; colors->SetColor("BkgColor", bkg.data());
|
||||
|
||||
// Create the renderer, the render window, and the interactor. The renderer
|
||||
// draws into the render window, the interactor enables mouse- and
|
||||
// keyboard-based interaction with the data within the render window.
|
||||
//
|
||||
vtkNew<vtkRenderWindow> renWin;
|
||||
vtkNew<vtkRenderWindowInteractor> iren;
|
||||
iren->SetRenderWindow(renWin);
|
||||
|
||||
|
||||
std::vector<LatticeComplexD> data(file_list.size(),&Grid);
|
||||
FieldMetaData header;
|
||||
|
||||
|
||||
int frameCount;
|
||||
if ( mpeg ) frameCount = latt_size[3];
|
||||
else frameCount = latt_size[0] * latt_size[3];
|
||||
|
||||
std::vector<FrameUpdater *> fu_list;
|
||||
for (int f=0;f<file_list.size();f++){
|
||||
|
||||
// It is convenient to create an initial view of the data. The FocalPoint
|
||||
// and Position form a vector direction. Later on (ResetCamera() method)
|
||||
// this vector is used to position the camera to look at the data in
|
||||
// this direction.
|
||||
vtkNew<vtkCamera> aCamera;
|
||||
aCamera->SetViewUp(0, 0, -1);
|
||||
aCamera->SetPosition(0, -1000, 0);
|
||||
aCamera->SetFocalPoint(0, 0, 0);
|
||||
aCamera->ComputeViewPlaneNormal();
|
||||
aCamera->Azimuth(30.0);
|
||||
aCamera->Elevation(30.0);
|
||||
|
||||
|
||||
vtkNew<vtkRenderer> aRenderer;
|
||||
renWin->AddRenderer(aRenderer);
|
||||
|
||||
double vol = data[f].Grid()->gSites();
|
||||
|
||||
std::cout << "Reading "<<file_list[f]<<std::endl;
|
||||
readFile(data[f],file_list[f]);
|
||||
|
||||
auto nrm = norm2(data[f]);
|
||||
auto nrmbar = nrm/vol;
|
||||
auto rms = sqrt(nrmbar);
|
||||
|
||||
double contour = default_contour * rms; // default to 1 x RMS
|
||||
|
||||
    // Build a VTK image volume holding the scalar field on one timeslice.
    // The dimensions match the spatial lattice; each voxel stores one double.
    vtkNew<vtkImageData> imageData;
    imageData->SetDimensions(latt_size[0],latt_size[1],latt_size[2]);
    imageData->AllocateScalars(VTK_DOUBLE, 1);
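    // Seed the volume with the real part of the field on the t=0 timeslice;
    // the FrameUpdater refreshes this volume as the animation advances.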
    for(int xx=0;xx<latt_size[0];xx++){
    for(int yy=0;yy<latt_size[1];yy++){
    for(int zz=0;zz<latt_size[2];zz++){
      Coordinate site({xx,yy,zz,0});
      RealD value = real(peekSite(data[f],site));
      imageData->SetScalarComponentFromDouble(xx,yy,zz,0,value);
    }}}

    vtkNew<isosurface> posExtractor;
    posExtractor->SetInputData(imageData);
    posExtractor->SetValue(0, contour);

    vtkNew<vtkStripper> posStripper;
    posStripper->SetInputConnection(posExtractor->GetOutputPort());

    vtkNew<vtkPolyDataMapper> posMapper;
    posMapper->SetInputConnection(posStripper->GetOutputPort());
    posMapper->ScalarVisibilityOff();

    vtkNew<vtkActor> pos;
    pos->SetMapper(posMapper);
    pos->GetProperty()->SetDiffuseColor(colors->GetColor3d("posColor").GetData());
    pos->GetProperty()->SetSpecular(0.3);
    pos->GetProperty()->SetSpecularPower(20);
    pos->GetProperty()->SetOpacity(0.5);

    // A second isosurface is extracted at minus the contour value.
    // The triangle stripper creates triangle strips from the isosurface;
    // these render much faster on many systems.
    vtkNew<isosurface> negExtractor;
    negExtractor->SetInputData(imageData);
    negExtractor->SetValue(0, -contour);

    vtkNew<vtkStripper> negStripper;
    negStripper->SetInputConnection(negExtractor->GetOutputPort());

    vtkNew<vtkPolyDataMapper> negMapper;
    negMapper->SetInputConnection(negStripper->GetOutputPort());
    negMapper->ScalarVisibilityOff();

    vtkNew<vtkActor> neg;
    neg->SetMapper(negMapper);
    neg->GetProperty()->SetDiffuseColor(colors->GetColor3d("Ivory").GetData());

    // An outline provides context around the data.
    vtkNew<vtkOutlineFilter> outlineData;
    outlineData->SetInputData(imageData);

    vtkNew<vtkPolyDataMapper> mapOutline;
    mapOutline->SetInputConnection(outlineData->GetOutputPort());

    vtkNew<vtkActor> outline;
    outline->SetMapper(mapOutline);
    outline->GetProperty()->SetColor(colors->GetColor3d("Black").GetData());

    vtkNew<vtkTextActor> Text;
    Text->SetInput(file_list[f].c_str());
    Text->SetPosition2(0,0);
    Text->GetTextProperty()->SetFontSize(48);
    Text->GetTextProperty()->SetColor(colors->GetColor3d("Gold").GetData());

    vtkNew<vtkTextActor> TextT;
    TextT->SetInput("T=0");
    TextT->SetPosition(0,.9*1024); // near the top of the 1024-pixel-high window
    TextT->GetTextProperty()->SetFontSize(48);
    TextT->GetTextProperty()->SetColor(colors->GetColor3d("Gold").GetData());

    // Actors are added to the renderer. An initial camera view is created.
    // The Dolly() method moves the camera towards the FocalPoint,
    // thereby enlarging the image.
    aRenderer->AddActor(Text);
    aRenderer->AddActor(TextT);
    aRenderer->AddActor(outline);
    aRenderer->AddActor(pos);
    aRenderer->AddActor(neg);

    // Sign up to receive TimerEvent
    vtkNew<FrameUpdater> fu;
    fu->imageData    = imageData;
    fu->grid_data    = &data[f];
    fu->text         = TextT;
    fu->maxCount     = frameCount;
    fu->posExtractor = posExtractor;
    fu->negExtractor = negExtractor;
    fu->rms          = rms;

    iren->AddObserver(vtkCommand::TimerEvent, fu);

    aRenderer->SetActiveCamera(aCamera);
    aRenderer->ResetCamera();
    aRenderer->SetBackground(colors->GetColor3d("BkgColor").GetData());
    aCamera->Dolly(1.0);

    double nf = file_list.size();
    std::cout << " Adding renderer " <<f<<" of "<<nf<<std::endl;
    aRenderer->SetViewport((1.0/nf)*f, 0.0, (1.0/nf)*(f+1), 1.0);
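    // Tile the renderers horizontally: each input file gets an equal-width viewport.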

    // Note that when camera movement occurs (as it does in the Dolly()
    // method), the clipping planes often need adjusting. Clipping planes
    // consist of two planes: near and far along the view direction. The
    // near plane clips out objects in front of the plane; the far plane
    // clips out objects behind the plane. This way only what is drawn
    // between the planes is actually rendered.
    aRenderer->ResetCameraClippingRange();

    fu_list.push_back(fu);
  }

  // Set the size of the render window (expressed in pixels): one
  // 1024x1024 tile per input file. Render once, then initialize the event loop.
  renWin->SetSize(1024*file_list.size(), 1024);
  renWin->SetWindowName("FieldDensity");
  renWin->Render();

  iren->Initialize();

  if ( mpeg ) {
#ifdef MPEG
    vtkWindowToImageFilter *imageFilter = vtkWindowToImageFilter::New();
    imageFilter->SetInput( renWin );
    imageFilter->SetInputBufferTypeToRGB();

    vtkFFMPEGWriter *writer = vtkFFMPEGWriter::New();
    writer->SetFileName("movie.avi");
    writer->SetRate(1);
    writer->SetInputConnection(imageFilter->GetOutputPort());
    writer->Start();
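
    // Drive the animation by hand: step every FrameUpdater one frame, then
    // re-capture the render window and encode it as the next movie frame.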
    for(int i=0;i<fu_list[0]->maxCount;i++){
      for(int f=0;f<fu_list.size();f++){
        fu_list[f]->Execute(iren,vtkCommand::TimerEvent,nullptr);
      }
      imageFilter->Modified();
      writer->Write();
    }
    writer->End();
    writer->Delete();
#else
    assert(0 && "MPEG support not compiled");
#endif
  } else {

    // Add control of the contour threshold via a slider widget
    vtkSmartPointer<vtkSliderRepresentation2D> sliderRep = vtkSmartPointer<vtkSliderRepresentation2D>::New();
    sliderRep->SetMinimumValue(0.1);
    sliderRep->SetMaximumValue(5.0);
    sliderRep->SetValue(1.0);
    sliderRep->SetTitleText("Fraction RMS");

    // Set color properties:
    // Change the color of the knob that slides
    // sliderRep->GetSliderProperty()->SetColor(colors->GetColor3d("Green").GetData());
    sliderRep->GetTitleProperty()->SetColor(colors->GetColor3d("AliceBlue").GetData());
    sliderRep->GetLabelProperty()->SetColor(colors->GetColor3d("AliceBlue").GetData());
    sliderRep->GetSelectedProperty()->SetColor(colors->GetColor3d("DeepPink").GetData());

    // Change the color of the bar
    sliderRep->GetTubeProperty()->SetColor(colors->GetColor3d("MistyRose").GetData());
    sliderRep->GetCapProperty()->SetColor(colors->GetColor3d("Yellow").GetData());
    sliderRep->SetSliderLength(0.05);
    sliderRep->SetSliderWidth(0.025);
    sliderRep->SetEndCapLength(0.02);

    double nf = file_list.size();
    sliderRep->GetPoint1Coordinate()->SetCoordinateSystemToNormalizedDisplay();
    sliderRep->GetPoint1Coordinate()->SetValue(0.1, 0.1);
    sliderRep->GetPoint2Coordinate()->SetCoordinateSystemToNormalizedDisplay();
    sliderRep->GetPoint2Coordinate()->SetValue(0.9/nf, 0.1);
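    // Place the slider along the bottom of the leftmost viewport, in
    // normalized display coordinates.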

    vtkSmartPointer<vtkSliderWidget> sliderWidget = vtkSmartPointer<vtkSliderWidget>::New();
    sliderWidget->SetInteractor(iren);
    sliderWidget->SetRepresentation(sliderRep);
    sliderWidget->SetAnimationModeToAnimate();
    sliderWidget->EnabledOn();

    // Create the slider callback
    vtkSmartPointer<SliderCallback> slidercallback = vtkSmartPointer<SliderCallback>::New();
    slidercallback->fu_list = fu_list;
    sliderWidget->AddObserver(vtkCommand::InteractionEvent, slidercallback);

    int timerId = iren->CreateRepeatingTimer(300);
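    // The 300 ms repeating timer fires the TimerEvents on which the
    // FrameUpdaters animate.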
    std::cout << "timerId: " << timerId << std::endl;

    // Start the interaction and timer
    iren->Start();
  }

  Grid_finalize();

  return EXIT_SUCCESS;
}

113 visualisation/README Normal file
@ -0,0 +1,113 @@
========================================
Visualisation of Grid / SciDAC format density fields using VTK (Visualisation Toolkit). Peter Boyle, 2025.
========================================

Uses:

https://vtk.org

Files are, for example, those produced by
     Grid/HMC/ComputeWilsonFlow.cc
and
     Grid/HMC/site_plaquette.cc

========================================
Prerequisites:
========================================

1) Install ffmpeg-7.0.2 (developer install, includes headers and libraries).
   MacOS note: ffmpeg must be installed from source -- homebrew installs only binaries.

   https://ffmpeg.org/download.html#releases

   Note: the latest ffmpeg (7.1.1) broke compatibility with VTK.

2) Build and install VTK-9.4.2, with FFMPEG support enabled.

   This is particularly involved on MacOS, so it is documented here.

   cd VTK-9.4.2
   mkdir build
   cd build
   ccmake ..

Using the ccmake editor, set:

   FFMPEG_DIR  /usr/local

Toggle "advanced mode" (t)

Set each of:
   CMAKE_EXE_LINKER_FLAGS
   CMAKE_MODULE_LINKER_FLAGS
   CMAKE_SHARED_LINKER_FLAGS
to:

   -framework Foundation -framework AudioToolbox -framework CoreAudio -liconv -lm -framework AVFoundation -framework CoreVideo -framework CoreMedia -framework CoreGraphics -framework AudioToolbox -framework OpenGL -framework OpenGL -framework VideoToolbox -framework CoreImage -framework AppKit -framework CoreFoundation -framework CoreServices -lz -lbz2 -Wl,-framework,CoreFoundation -Wl,-framework,Security -L/usr/local/lib -lavdevice -lavfilter -lavformat -lavcodec -lswresample -lswscale -lavutil

Set paths for each of:
   FFMPEG_DIR                     /usr/local
   FFMPEG_avcodec_INCLUDE_DIR     /usr/local/include
   FFMPEG_avcodec_LIBRARY         /usr/local/lib/libavcodec.a
   FFMPEG_avdevice_INCLUDE_DIR    /usr/local/include
   FFMPEG_avdevice_LIBRARY        /usr/local/lib/libavdevice.a
   FFMPEG_avfilter_INCLUDE_DIR    /usr/local/include
   FFMPEG_avfilter_LIBRARY        /usr/local/lib/libavfilter.a
   FFMPEG_avformat_INCLUDE_DIR    /usr/local/include
   FFMPEG_avformat_LIBRARY        /usr/local/lib/libavformat.a
   FFMPEG_avresample_INCLUDE_DIR  /usr/local/include
   FFMPEG_avresample_LIBRARY      /usr/local/lib/libavresample.a
   FFMPEG_avutil_INCLUDE_DIR      /usr/local/include
   FFMPEG_avutil_LIBRARY          /usr/local/lib/libavutil.a
   FFMPEG_swresample_INCLUDE_DIR  /usr/local/include
   FFMPEG_swresample_LIBRARY      /usr/local/lib/libswresample.a
   FFMPEG_swscale_INCLUDE_DIR     /usr/local/include
   FFMPEG_swscale_LIBRARY         /usr/local/lib/libswscale.a

and enable:
   VTK_MODULE_ENABLE_VTK_IOFFMPEG YES

VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS.
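
For reference, the same configuration can be given non-interactively. This is a
sketch only, assuming the /usr/local install prefix used above; the remaining
FFMPEG_* cache variables are passed in the same way:

   cmake .. -DFFMPEG_DIR=/usr/local \
            -DVTK_MODULE_ENABLE_VTK_IOFFMPEG=YES \
            -DFFMPEG_avcodec_INCLUDE_DIR=/usr/local/include \
            -DFFMPEG_avcodec_LIBRARY=/usr/local/lib/libavcodec.a \
            -DCMAKE_EXE_LINKER_FLAGS="<linker flags as above>"

then build and install with make && make install.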

========================================
Grid:
========================================

3) Build and install a version of Grid.

4) Ensure "grid-config" is in your path.

5) cd Grid/visualisation/

   libs=`grid-config --libs`
   ldflags=`grid-config --ldflags`
   cxxflags=`grid-config --cxxflags`
   cxx=`grid-config --cxx`

   mkdir build
   cd build

   LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_CXX_COMPILER=$cxx -DCMAKE_CXX_FLAGS=$cxxflags

   make

6) Invoke as:

   FieldDensityAnimate --isosurface <float-from-0-to-5> --grid X.Y.Z.T --file1 SciDacDensityFile1 [--xlate] [--mpeg]
   FieldDensityAnimate --isosurface <float-from-0-to-5> --grid X.Y.Z.T --file1 SciDacDensityFile1 --file2 SciDacDensityFile2 [--xlate] [--mpeg]
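
   For example (the file name is illustrative only, not shipped with Grid), to
   contour at 1.5 x RMS on a 16^3 x 32 lattice and write movie.avi:

   FieldDensityAnimate --isosurface 1.5 --grid 16.16.16.32 --file1 wflow.density.scidac --mpeg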

==================================
Extensions
==================================

7) Direct calling from Grid?

   Not yet implemented, but a sufficient interface could be developed to write a
   Lattice scalar field to MPEG directly from running code.

8) Example python code: FieldDensity.py. This is not interfaced to Grid.

BIN visualisation/Topo-vs-flowtime.avi Normal file
Binary file not shown.

9 visualisation/cmake-command Normal file
@ -0,0 +1,9 @@
libs=`grid-config --libs`
ldflags=`grid-config --ldflags`
cxxflags=`grid-config --cxxflags`
cxx=`grid-config --cxx`

mkdir build
cd build

LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_CXX_COMPILER=$cxx -DCMAKE_CXX_FLAGS=$cxxflags