Merge pull request #475 from lehner/feature-aurora

Sync with GPT on Aurora
verbosity
2025-06-14 13:57:07 +01:00 · 2025-03-13 08:55:55 -04:00 · 2025-03-13 12:49:36 +00:00 · 2025-03-13 08:48:23 +00:00 · 2025-03-13 07:29:55 +00:00 · 2025-03-11 15:02:32 +00:00
36 changed files with 2450 additions and 552 deletions
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@ -191,7 +191,7 @@ public:
    Lattice<sobj> pgbuf(&pencil_g);
    autoView(pgbuf_v , pgbuf, CpuWrite);
-    std::cout << "CPU view" << std::endl;
+    //std::cout << "CPU view" << std::endl;
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
@ -215,7 +215,7 @@ public:
    else if ( sign == forward ) div = 1.0;
    else assert(0);
-    std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
    FFTW_plan p;
    {
      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +229,7 @@ public:
    }
    // Barrel shift and collect global pencil
-    std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
    Coordinate lcoor(Nd), gcoor(Nd);
    result = source;
    int pc = processor_coor[dim];
@ -251,7 +251,7 @@ public:
      }
    }
-    std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
+    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
    // Loop over orthog coords
    int NN=pencil_g.lSites();
    GridStopWatch timer;
@ -274,7 +274,7 @@ public:
    usec += timer.useconds();
    flops+= flops_call*NN;
-    std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
+    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
    // writing out result
    {
      autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +291,7 @@ public:
    }
    result = result*div;
-    std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
+    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
    // destroying plan
    FFTW<scalar>::fftw_destroy_plan(p);
 #endif
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@ -208,8 +208,8 @@ public:
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpB!=GridBLAS_OP_T);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@ -367,28 +367,67 @@ public:
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  else
 	    eCmn = alpha * eAmk * eBkn ;
        });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.adjoint() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.adjoint() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  else
 	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
@ -414,8 +453,8 @@ public:
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpB!=GridBLAS_OP_T);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@ -514,28 +553,70 @@ public:
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  else
 	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.adjoint() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.adjoint() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  else
 	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
@ -661,28 +742,40 @@ public:
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  else
 	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.transpose() ;	  
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
      } else { 
 	assert(0);
@ -809,28 +902,40 @@ public:
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  else
 	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
      } else { 
 	assert(0);
--- a/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
+++ b/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
@ -144,11 +144,11 @@ public:
      acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
    }    
    RealD t4 = usecond();
-    std::cout << "MulMatrix alloc    took "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance << "MulMatrix alloc    took "<< t1-t0<<" us"<<std::endl;
-    std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
-    std::cout << "MulMatrix blas     took "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix blas     took "<< t3-t2<<" us"<<std::endl;
-    std::cout << "MulMatrix copy     took "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix copy     took "<< t4-t3<<" us"<<std::endl;
-    std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
  }
  void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
@ -242,16 +242,16 @@ public:
    RealD flops = 8.0*M*N*K;
    flops = flops/(t4-t3)/1.e3;
    bytes = bytes/(t4-t3)/1.e3;
-    std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
-    std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp    t2 "<< t2-t1<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp    t2 "<< t2-t1<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
-    std::cout << "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp   t6 "<< t6-t5<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp   t6 "<< t6-t5<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
 #else
    int nrhs;
    GridBase *grid;
@ -358,17 +358,17 @@ public:
    flops = flops/(t4-t3)/1.e3;
    bytes = bytes/(t4-t3)/1.e3;
    xybytes = 4*xybytes/(t2-t1)/1.e3;
-    std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
-    std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix cp    t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp    t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
-    std::cout << "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
-    std::cout << "InnerProductMatrix cp     t5 "<< t5-t4<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix cp     t5 "<< t5-t4<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix lsum   t6l "<< t6l-t5<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix lsum   t6l "<< t6l-t5<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix gsum   t6 "<< t6-t6l<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum   t6 "<< t6-t6l<<" us"<<std::endl;
-    std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
+    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
 #endif
  }
 };
--- a/Grid/algorithms/iterative/AdefMrhs.h
+++ b/Grid/algorithms/iterative/AdefMrhs.h
@ -63,7 +63,12 @@ class TwoLevelCGmrhs
  GridStopWatch SmoothTimer;
  GridStopWatch InsertTimer;
-  
+  /*
    Field rrr;
  Field sss;
  Field qqq;
  Field zzz;
  */  
  // more most opertor functions
  TwoLevelCGmrhs(RealD tol,
 		 Integer maxit,
@ -74,6 +79,12 @@ class TwoLevelCGmrhs
    MaxIterations(maxit),
    _FineLinop(FineLinop),
    _Smoother(Smoother)
    /*
    rrr(fine),
    sss(fine),
    qqq(fine),
    zzz(fine)
 */
  {
    grid       = fine;
  };
@ -81,8 +92,8 @@ class TwoLevelCGmrhs
  // Vector case
  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
  {
-    SolveSingleSystem(src,x);
+    //    SolveSingleSystem(src,x);
-    //    SolvePrecBlockCG(src,x);
+    SolvePrecBlockCG(src,x);
  }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -657,6 +668,8 @@ public:
    CoarseField PleftProjMrhs(this->coarsegridmrhs);
    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
    //    this->rrr=in[0];
 #undef SMOOTHER_BLOCK_SOLVE
 #if SMOOTHER_BLOCK_SOLVE
    this->SmoothTimer.Start();
@ -669,6 +682,7 @@ public:
      this->SmoothTimer.Stop();
    }
 #endif
    //    this->sss=Min[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
@ -705,9 +719,11 @@ public:
    this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]  
    this->PromoteTimer.Stop();
    this->FineTimer.Start();
    //    this->qqq=tmp[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
      axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
    }
    //    this->zzz=out[0];
    this->FineTimer.Stop();
  }
 };
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@ -74,7 +74,7 @@ public:
  void operator() (const Field &src, Field &psi){
-    psi=Zero();
+    //    psi=Zero();
    RealD cp, ssq,rsq;
    ssq=norm2(src);
    rsq=Tolerance*Tolerance*ssq;
--- a/Grid/algorithms/multigrid/Aggregates.h
+++ b/Grid/algorithms/multigrid/Aggregates.h
@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 NAMESPACE_BEGIN(Grid);
 inline RealD AggregatePowerLaw(RealD x)
@ -124,6 +126,53 @@ public:
    }
  }
  virtual void CreateSubspaceGCR(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
  {
    RealD scale;
    TrivialPrecon<FineField> simple_fine;
    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
    FineField noise(FineGrid);
    FineField src(FineGrid);
    FineField guess(FineGrid);
    FineField Mn(FineGrid);
    for(int b=0;b<nn;b++){
      subspace[b] = Zero();
      gaussian(RNG,noise);
      scale = std::pow(norm2(noise),-0.5); 
      noise=noise*scale;
      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
      for(int i=0;i<3;i++){
 	//  void operator() (const Field &src, Field &psi){
 #if 1
 	std::cout << GridLogMessage << " inverting on noise "<<std::endl;
 	src = noise;
 	guess=Zero();
 	GCR(src,guess);
 	subspace[b] = guess;
 #else
 	std::cout << GridLogMessage << " inverting on zero "<<std::endl;
 	src=Zero();
 	guess = noise;
 	GCR(src,guess);
 	subspace[b] = guess;
 #endif
 	noise = subspace[b];
 	scale = std::pow(norm2(noise),-0.5); 
 	noise=noise*scale;
      }
      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
      subspace[b]   = noise;
    }
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
@ -160,14 +209,21 @@ public:
    int b =0;
    {
      ComplexD ip;
      // Filter
      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
      hermop.Op(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      ip= innerProduct(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
      hermop.AdjOp(Mn,tmp); 
      ip = innerProduct(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
      b++;
    }
@ -213,8 +269,18 @@ public:
 	  Mn=*Tnp;
 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
 	  subspace[b] = Mn;
 	  ComplexD ip;
 	  hermop.Op(Mn,tmp);
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  ip= innerProduct(Mn,tmp); 
 	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
 	  hermop.AdjOp(Mn,tmp); 
 	  ip = innerProduct(Mn,tmp); 
 	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
 	  b++;
 	}
@ -228,6 +294,70 @@ public:
    }
    assert(b==nn);
  }
  virtual void CreateSubspacePolyCheby(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
 				       double lo1,
 				       int orderfilter,
 				       double lo2,
 				       int orderstep)
  {
    RealD scale;
    FineField noise(FineGrid);
    FineField Mn(FineGrid);
    FineField tmp(FineGrid);
    // New normalised noise
    gaussian(RNG,noise);
    scale = std::pow(norm2(noise),-0.5); 
    noise=noise*scale;
    std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
    // Initial matrix element
    hermop.Op(noise,Mn);
    std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
    int b =0;
    {
      // Filter
      std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
      Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
      hermop.Op(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
    }
    // Generate a full sequence of Chebyshevs
    for(int n=1;n<nn;n++){
      std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
      Chebyshev<FineField> Cheb(lo2,hi,orderstep);
      Cheb(hermop,subspace[n-1],Mn);
      for(int m=0;m<n;m++){
 	ComplexD c = innerProduct(subspace[m],Mn);
 	Mn = Mn - c*subspace[m];
      }
      // normalise
      scale = std::pow(norm2(Mn),-0.5);
      Mn=Mn*scale;
      subspace[n]=Mn;
      hermop.Op(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
      std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
    }
  }
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@ -69,7 +69,7 @@ public:
  }
  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { assert(0);};
+  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
@ -175,10 +175,11 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
 template<class T> using hostVector          = std::vector<T,alignedAllocator<T> >;           // Needs autoview
-template<class T> using Vector              = std::vector<T,uvmAllocator<T> >;               // 
+template<class T> using Vector              = std::vector<T,uvmAllocator<T> >;               // Really want to deprecate
 template<class T> using uvmVector           = std::vector<T,uvmAllocator<T> >;               // auto migrating page
 template<class T> using deviceVector        = std::vector<T,devAllocator<T> >;               // device vector
 /*
 template<class T> class vecView
 {
 protected:
@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
 #define autoVecView(v_v,v,mode)					\
  auto v_v = VectorView(v,mode);				\
  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
 */
 NAMESPACE_END(Grid);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ];
 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
 #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug  << print_buffer << std::endl;
 //#define dprintf(...) 
 //#define mprintf(...) 
 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
    AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  //  uint64_t CpuPtr = AccCache.CpuPtr;
  DeviceEvictions++;
@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: acceleratorCopyToDevice   Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
+  mprintf("MemoryManager: acceleratorCopyToDevice   Clone size %ld AccPtr %lx <- CpuPtr %lx",
 	  (uint64_t)AccCache.bytes,
 	  (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    CpuViewClose((uint64_t)Ptr);
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
  if(bytes>=DeviceMaxBytes) {
    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
  }
  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error
  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
+    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
  } else if(AccCache.state==CpuDirty ){
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
  } else {
    assert(0);
  }
@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  // If view is opened on device must remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU \n");
+    dprintf("AccCache entry removed from LRU ");
    LRUremove(AccCache);
  }
@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
  AccCache.accLock--;
  // Move to LRU queue if not locked and close on device
  if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
    LRUinsert(AccCache);
  } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
  }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>
 #define NVLINK_GET
 NAMESPACE_BEGIN(Grid);
 extern bool Stencil_force_mpi ;
@ -147,6 +149,7 @@ public:
 			    sizeof(obj),d*100+p);
      }
      if (!list.empty()) // avoid triggering assert in comms == none
 	CommsComplete(list);
      for(int p=1;p<_processors[d];p++){
 	accum = accum + column[p];
@ -192,6 +195,11 @@ public:
 				      void *recv,
 				      int recv_from_rank,int do_recv,
 				      int xbytes,int rbytes,int dir);
  // Could do a PollHtoD and have a CommsMerge dependence
  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 ////////////////////////////////////////////
@ -362,8 +363,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int bytes)
 {
  std::vector<MpiCommsRequest_t> reqs(0);
  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
  int myrank = _processor;
  int ierr;
@ -379,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 		    communicator,MPI_STATUS_IGNORE);
  assert(ierr==0);
  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@ -399,6 +395,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 #ifdef ACCELERATOR_AWARE_MPI
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
 void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 							   void *xmit,
 							   int dest,int dox,
@ -561,53 +559,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-#undef DEVICE_TO_HOST_CONCURRENT // pipeline
+
 #ifdef DEVICE_TO_HOST_CONCURRENT
      tag= dir+_processor*32;
      host_xmit = this->HostBufferMalloc(xbytes);
-      acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+      CommsRequest_t srq;
      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      //      assert(ierr==0);
      //      off_node_bytes+=xbytes;
      CommsRequest_t srq;
      srq.PacketType = InterNodeXmit;
      srq.bytes      = xbytes;
      //      srq.req        = xrq;
      srq.host_buf   = host_xmit;
      srq.device_buf = xmit;
      srq.tag        = tag;
      srq.dest       = dest;
      srq.commdir    = commdir;
      list.push_back(srq);
 #else
      tag= dir+_processor*32;
      host_xmit = this->HostBufferMalloc(xbytes);
      const int chunks=1;
      for(int n=0;n<chunks;n++){
 	void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
 	void * xmitc      = (void *)( (uint64_t) xmit      + n*xbytes/chunks);
 	acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
      }
      acceleratorCopySynchronise(); // Complete all pending copy transfers
      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
      off_node_bytes+=xbytes;
      CommsRequest_t srq;
      srq.PacketType = InterNodeXmit;
      srq.bytes      = xbytes;
      srq.req        = xrq;
      srq.host_buf   = host_xmit;
      srq.device_buf = xmit;
      list.push_back(srq);
 #endif
    }
  }
  return off_node_bytes;
 }
 /*
 * In the interest of better pipelining, poll for completion on each DtoH and 
 * start MPI_ISend in the meantime
 */
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
 {
  int pending = 0;
  do {
    pending = 0;
    for(int idx = 0; idx<list.size();idx++){
      if ( list[idx].PacketType==InterNodeRecv ) {
 	int flag = 0;
 	MPI_Status status;
 	int ierr = MPI_Test(&list[idx].req,&flag,&status);
 	assert(ierr==0);
 	if ( flag ) {
 	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
 	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
 	  list[idx].PacketType=InterNodeReceiveHtoD;
 	} else {
 	  pending ++;
 	}
      }
    }
    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
  } while ( pending );
 }
 void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
 {
  int pending = 0;
  do {
    pending = 0;
    for(int idx = 0; idx<list.size();idx++){
      if ( list[idx].PacketType==InterNodeXmit ) {
 	if ( acceleratorEventIsComplete(list[idx].ev) ) {
 	  void *host_xmit = list[idx].host_buf;
 	  uint32_t xbytes = list[idx].bytes;
 	  int dest        = list[idx].dest;
 	  int tag         = list[idx].tag;
 	  int commdir     = list[idx].commdir;
 	  ///////////////////
 	  // Send packet
 	  ///////////////////
 	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
 	  MPI_Request xrq;
 	  int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
 	  assert(ierr==0);
 	  list[idx].req        = xrq; // Update the MPI request in the list
 	  list[idx].PacketType=InterNodeXmitISend;
 	} else {
 	  // not done, so return to polling loop
 	  pending++;
 	}
      }
    }
  } while (pending);
 }  
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
@ -644,69 +694,89 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   * - complete all copies
   * - post MPI send asynch
   */
 #ifdef NVLINK_GET
  if ( dor ) {
-  //  static int printed;
+    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
-  //  if((printed<8) && this->IsBoss() ) {
+      // Intranode
-  //    printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
-  //    printed++;
+      assert(shm!=NULL);
  //  }
      CommsRequest_t srq;
      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
      srq.PacketType = IntraNodeRecv;
      srq.bytes      = xbytes;
      //      srq.req        = xrq;
      srq.host_buf   = NULL;
      srq.device_buf = xmit;
      srq.tag        = -1;
      srq.dest       = dest;
      srq.commdir    = dir;
      list.push_back(srq);
    }
  }  
 #else
  if (dox) {
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+    if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
-#ifdef DEVICE_TO_HOST_CONCURRENT
+      // Intranode
      tag= dir+_processor*32;
      // Find the send in the prepared list
      int list_idx=-1;
      for(int idx = 0; idx<list.size();idx++){
 	if ( (list[idx].device_buf==xmit)
 	   &&(list[idx].PacketType==InterNodeXmit)
 	   &&(list[idx].bytes==xbytes) ) {
 	  list_idx = idx;
 	  host_xmit = list[idx].host_buf;
 	}
      }
      assert(list_idx != -1); // found it
      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
      list[list_idx].req        = xrq; // Update the MPI request in the list
      off_node_bytes+=xbytes;
 #endif      
    } else {
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+
      CommsRequest_t srq;
      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
      srq.PacketType = IntraNodeXmit;
      srq.bytes      = xbytes;
      //      srq.req        = xrq;
      srq.host_buf   = NULL;
      srq.device_buf = xmit;
      srq.tag        = -1;
      srq.dest       = dest;
      srq.commdir    = dir;
      list.push_back(srq);
    }
  }
 #endif
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  int nreq=list.size();
+  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
-  if (nreq==0) return;
+  std::vector<MPI_Status> status;
-  std::vector<MPI_Status> status(nreq);
+  std::vector<MPI_Request> MpiRequests;
  std::vector<MPI_Request> MpiRequests(nreq);
-  for(int r=0;r<nreq;r++){
+  for(int r=0;r<list.size();r++){
-    MpiRequests[r] = list[r].req;
+    // Must check each Send buf is clear to reuse
    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
  }
-  int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
+  int nreq=MpiRequests.size();
  if (nreq>0) {
    status.resize(MpiRequests.size());
    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
    assert(ierr==0);
  for(int r=0;r<nreq;r++){
    if ( list[r].PacketType==InterNodeRecv ) {
      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
    }
  }
-  acceleratorCopySynchronise(); // Complete all pending copy transfers
+  //  for(int r=0;r<nreq;r++){
  //    if ( list[r].PacketType==InterNodeRecv ) {
  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
  //    }
  //  }
  list.resize(0);               // Delete the list
  this->HostBufferFreeAll();    // Clean up the buffer allocs
-  this->StencilBarrier(); 
+#ifndef NVLINK_GET
  this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
 #endif   
 }
 #endif
 ////////////////////////////////////////////
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
 void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 							   void *xmit,
 							   int xmit_to_rank,int dox,
@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 							   int recv_from_rank,int dor,
 							   int xbytes,int rbytes, int dir)
 {
-  return xbytes+rbytes;
+  return 0.0;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t;
 #ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
 #else
-enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
+/*
 * Enable state transitions as each packet flows.
 */
 enum PacketType_t {
  FaceGather,
  InterNodeXmit,
  InterNodeRecv,
  IntraNodeXmit,
  IntraNodeRecv,
  InterNodeXmitISend,
  InterNodeReceiveHtoD
 };
 /*
 *Package arguments needed for various actions along packet flow
 */
 typedef struct {
  PacketType_t PacketType;
  void *host_buf;
  void *device_buf;
  int dest;
  int tag;
  int commdir;
  unsigned long bytes;
  acceleratorEvent_t ev;
  MpiCommsRequest_t req;
 } CommsRequest_t;
 #endif
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -542,7 +542,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  printf("Host buffer allocate for GPU non-aware MPI\n");
+  // printf("Host buffer allocate for GPU non-aware MPI\n");
 #if 0
  HostCommBuf= acceleratorAllocHost(bytes);
 #else 
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@ -68,7 +68,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }
-#if 1
+
 template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
 {
  int sshift[2];
@ -125,6 +125,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      tcomms-=usecond();
      grid->Barrier();
 #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
 #else
      // bouncy bouncy
      acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
      grid->SendToRecvFrom((void *)&hsend_buf[0],
 			   xmit_to_rank,
 			   (void *)&hrecv_buf[0],
 			   recv_from_rank,
 			   bytes);
      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
 #endif
      xbytes+=bytes;
      grid->Barrier();
      tcomms+=usecond();
@ -231,6 +248,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
 #ifndef ACCELERATOR_AWARE_MPI
  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
  int bytes = buffer_size*sizeof(scalar_object);
@ -283,11 +304,22 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
 #ifdef ACCELERATOR_AWARE_MPI
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
 			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
 #else
      // bouncy bouncy
 	acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
 	grid->SendToRecvFrom((void *)&hsend_buf[0],
 			     xmit_to_rank,
 			     (void *)&hrecv_buf[0],
 			     recv_from_rank,
 			     bytes);
 	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
 #endif
 	xbytes+=bytes;
 	grid->Barrier();
@ -311,234 +343,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  }
 }
 #else 
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
  GridBase *grid=rhs.Grid();
  Lattice<vobj> temp(rhs.Grid());
  int fd              = rhs.Grid()->_fdimensions[dimension];
  int rd              = rhs.Grid()->_rdimensions[dimension];
  int pd              = rhs.Grid()->_processors[dimension];
  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  assert(simd_layout==1);
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
  vobj *send_buf;
  vobj *recv_buf;
  {
    grid->ShmBufferFreeAll();
    size_t bytes = buffer_size*sizeof(vobj);
    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
  }
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  for(int x=0;x<rd;x++){       
    int sx        =  (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd;
    if (comm_proc==0) {
      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
      tcopy+=usecond();
    } else {
      int words = buffer_size;
      if (cbmask != 0x3) words=words>>1;
      int bytes = words * sizeof(vobj);
      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
      tgather+=usecond();
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      tcomms-=usecond();
      //      grid->Barrier();
      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
      xbytes+=bytes;
      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
      //      grid->Barrier();
      tcomms+=usecond();
      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
      tscatter+=usecond();
    }
  }
  if(Cshift_verbose){
    std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  }
 }
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid=rhs.Grid();
  const int Nsimd = grid->Nsimd();
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  int fd = grid->_fdimensions[dimension];
  int rd = grid->_rdimensions[dimension];
  int ld = grid->_ldimensions[dimension];
  int pd = grid->_processors[dimension];
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;
  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
  assert(comm_dim==1);
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int permute_type=grid->PermuteType(dimension);
  ///////////////////////////////////////////////
  // Simd direction uses an extract/merge pair
  ///////////////////////////////////////////////
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);
  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  {
    size_t bytes = sizeof(scalar_object)*buffer_size;
    grid->ShmBufferFreeAll();
    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
  }
  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
  int bytes = buffer_size*sizeof(scalar_object);
  ExtractPointerArray<scalar_object>  pointers(Nsimd); // 
  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
  ///////////////////////////////////////////
  // Work out what to send where
  ///////////////////////////////////////////
  int cb    = (cbmask==0x2)? Odd : Even;
  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  // loop over outer coord planes orthog to dim
  for(int x=0;x<rd;x++){       
    // FIXME call local permute copy if none are offnode.
    for(int i=0;i<Nsimd;i++){       
      pointers[i] = &send_buf_extract[i][0];
    }
    tgather-=usecond();
    int sx   = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    tgather+=usecond();
    for(int i=0;i<Nsimd;i++){
      int inner_bit = (Nsimd>>(permute_type+1));
      int ic= (i&inner_bit)? 1:0;
      int my_coor          = rd*ic + x;
      int nbr_coor         = my_coor+sshift;
      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
      int nbr_lane = (i&(~inner_bit));
      int recv_from_rank;
      int xmit_to_rank;
      if (nbr_ic) nbr_lane|=inner_bit;
      assert (sx == nbr_ox);
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 	tcomms-=usecond();
 	//	grid->Barrier();
 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
 			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
 	xbytes+=bytes;
 	//	grid->Barrier();
 	tcomms+=usecond();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }
    }
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
  }
  if(Cshift_verbose){
    std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
  }
 }
 #endif
 NAMESPACE_END(Grid); 
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@ -466,6 +466,12 @@ public:
    static deviceVector<vobj> recv_buf;
    send_buf.resize(buffer_size*2*depth);    
    recv_buf.resize(buffer_size*2*depth);
 #ifndef ACCELERATOR_AWARE_MPI
    static hostVector<vobj> hsend_buf; 
    static hostVector<vobj> hrecv_buf;
    hsend_buf.resize(buffer_size*2*depth);    
    hrecv_buf.resize(buffer_size*2*depth);
 #endif    
    std::vector<MpiCommsRequest_t> fwd_req;   
    std::vector<MpiCommsRequest_t> bwd_req;   
@ -495,9 +501,17 @@ public:
      t_gather+=usecond()-t;
      t=usecond();
 #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(fwd_req,
 				(void *)&send_buf[d*buffer_size], xmit_to_rank,
 				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
 #else
      acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
      grid->SendToRecvFromBegin(fwd_req,
 				(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
 				(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
      acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
 #endif
      t_comms+=usecond()-t;
     }
    for ( int d=0;d < depth ; d ++ ) {
@ -508,9 +522,17 @@ public:
      t_gather+= usecond() - t;
      t=usecond();
 #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(bwd_req,
 				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
 				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
 #else
      acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
      grid->SendToRecvFromBegin(bwd_req,
 				(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
 				(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
      acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
 #endif      
      t_comms+=usecond()-t;
    }
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -484,6 +484,11 @@ public:
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    accelerator_barrier();
 #ifdef NVLINK_GET
    this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
    // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
    // Or issue barrier AFTER the DMA is running
 #endif    
  }
 };
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v , st,AcceleratorRead);
   if( interior && exterior ) {
-     acceleratorFenceComputeStream();
+     //     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
-     // dependent on result of merge
+     //     // dependent on result of merge
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -363,12 +363,16 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    //    std::cout << "Communicate Begin "<<std::endl;
    //    _grid->Barrier();
    FlightRecorder::StepLog("Communicate begin");
    // All GPU kernel tasks must complete
    //    accelerator_barrier();     // All kernels should ALREADY be complete
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
                               // But the HaloGather had a barrier too.
    for(int i=0;i<Packets.size();i++){
      //      std::cout << "Communicate prepare "<<i<<std::endl;
      //      _grid->Barrier();
      _grid->StencilSendToRecvFromPrepare(MpiReqs,
 					  Packets[i].send_buf,
 					  Packets[i].to_rank,Packets[i].do_send,
@ -376,8 +380,15 @@ public:
 					  Packets[i].from_rank,Packets[i].do_recv,
 					  Packets[i].xbytes,Packets[i].rbytes,i);
    }
    //    std::cout << "Communicate PollDtoH "<<std::endl;
    //    _grid->Barrier();
    _grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
    //    std::cout << "Communicate CopySynch "<<std::endl;
    //    _grid->Barrier();
    acceleratorCopySynchronise();
    // Starts intranode
    for(int i=0;i<Packets.size();i++){
      //      std::cout << "Communicate Begin "<<i<<std::endl;
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
@ -395,7 +406,14 @@ public:
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    //    std::cout << "Communicate Complete "<<std::endl;
    //    _grid->Barrier();
    FlightRecorder::StepLog("Start communicate complete");
    //    std::cout << "Communicate Complete PollIRecv "<<std::endl;
    //    _grid->Barrier();
    _grid->StencilSendToRecvFromPollIRecv(MpiReqs);
    //    std::cout << "Communicate Complete Complete "<<std::endl;
    //    _grid->Barrier();
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
@ -483,6 +501,9 @@ public:
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
    //    accelerator_barrier();
    //////////////////////////////////
    // I will overwrite my send buffers
    //////////////////////////////////
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    assert(source.Grid()==_grid);
@ -496,7 +517,11 @@ public:
      HaloGatherDir(source,compress,point,face_idx);
    }
    accelerator_barrier(); // All my local gathers are complete
-    //    _grid->StencilBarrier();// Synch shared memory on a single nodes
+#ifdef NVLINK_GET
    _grid->StencilBarrier(); // He can now get mu local gather, I can get his
    // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
    // Or issue barrier AFTER the DMA is running
 #endif    
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
  }
@ -535,6 +560,7 @@ public:
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
      });
      acceleratorFenceComputeStream();
      // Also fenced in WilsonKernels
    }
  }
@ -663,7 +689,6 @@ public:
 	}
      }
    }
    std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
    surface_list.resize(surface_list_size);
    std::vector<int> surface_list_host(surface_list_size);
    int32_t ss=0;
@ -683,6 +708,7 @@ public:
      }
    }
    acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
    std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
+inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
+inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
@ -343,11 +343,28 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);}
+
-inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); }
+///////
-inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
+// Asynch event interface
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+///////
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+typedef sycl::event acceleratorEvent_t;
 inline void acceleratorEventWait(acceleratorEvent_t ev)
 {
  ev.wait();
 }
 inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
 {
  return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
 }
 inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { return theCopyAccelerator->memcpy(to,from,bytes);}
 inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
 inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
 inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
 inline int  acceleratorIsCommunicable(void *ptr)
@ -358,8 +375,10 @@ inline int  acceleratorIsCommunicable(void *ptr)
  else return 0;
 #endif
  return 1;
 }
 #endif
 //////////////////////////////////////////////
@ -492,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
  auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
 }
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
@ -564,9 +583,9 @@ inline void acceleratorMem(void)
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
@ -649,15 +668,15 @@ accelerator_inline void acceleratorFence(void)
  return;
 }
-inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
+inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes)
 {
  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
  acceleratorCopySynchronise();
 }
-template<class T> void acceleratorPut(T& dev,T&host)
+template<class T> void acceleratorPut(T& dev,const T&host)
 {
-  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+  acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
 }
 template<class T> T acceleratorGet(T& dev)
 {
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_critical                                     DO_PRAGMA(omp critical)
 #ifdef GRID_OMP
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
-  uint64_t *ufrom = (uint64_t *)from;
+  const uint64_t *ufrom = (const uint64_t *)from;
  uint64_t *uto   = (uint64_t *)to;
  assert(bytes%8==0);
  uint64_t words=bytes/8;
@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
  });
 }
 #else
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
  bcopy(from,to,bytes);
 }
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -509,6 +509,13 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);
  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
    FlightRecorder::PrintEntireLog = 1;
    FlightRecorder::ChecksumComms  = 1;
    FlightRecorder::ChecksumCommsSend=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
@ -651,3 +658,4 @@ void Grid_debug_handler_init(void)
 }
 NAMESPACE_END(Grid);
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@ -50,7 +50,7 @@ namespace Grid{
      int64_t index64;
      IndexFromCoorReversed(coor,index64,dims);
      if ( index64>=2*1024*1024*1024LL ){
-	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+	//	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
      }
      assert(index64<2*1024*1024*1024LL);
      index = (int) index64;
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@ -52,7 +52,7 @@ int main (int argc, char ** argv)
  int threads = GridThread::GetThreads();
-  int Ls=8;
+  int Ls=16;
  for(int i=0;i<argc;i++) {
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@ -175,8 +175,8 @@ public:
 	    timestat.statistics(t_time);
 	    dbytes=dbytes*ppn;
-	    double xbytes    = dbytes*0.5;
+	    double xbytes    = dbytes;
-	    double bidibytes = dbytes;
+	    double bidibytes = dbytes*2.0;
 	    std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
 		     << bytes << " \t "
@ -492,17 +492,18 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 500;
+	uint64_t no    = 50;
-
+	uint64_t ni    = 100;
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
+	std::vector<double> t_time(no);
-	for(uint64_t i=0;i<ncall;i++){
+	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
 	  for(uint64_t j=0;j<ni;j++){
 	    Dw.DhopEO(src_o,r_e,DaggerNo);
 	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@ -520,11 +521,11 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
+	mf_hi = flops/timestat.min*ni;
-	mf_lo = flops/timestat.max;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@ -535,6 +536,7 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
      }
@ -654,17 +656,19 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
 	uint64_t ncall = 500;
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t no    = 50;
 	uint64_t ni    = 100;
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
+	std::vector<double> t_time(no);
-	for(uint64_t i=0;i<ncall;i++){
+	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
 	  for(uint64_t j=0;j<ni;j++){
 	    Ds.DhopEO(src_o,r_e,DaggerNo);
 	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@ -675,11 +679,11 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
+	mf_hi = flops/timestat.min*ni;
-	mf_lo = flops/timestat.max;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@ -689,6 +693,7 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
      }
@ -792,19 +797,18 @@ public:
 	  Dc.M(src,r);
 	}
 	FGrid->Barrier();
-	double t1=usecond();
+	uint64_t ni = 100;
-	uint64_t ncall = 500;
+	uint64_t no = 50;
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
+	std::vector<double> t_time(no);
-	for(uint64_t i=0;i<ncall;i++){
+	for(uint64_t i=0;i<no;i++){
-	  t0=usecond();
+	  double t0=usecond();
 	  for(uint64_t j=0;j<ni;j++){
 	    Dc.M(src,r);
-	  t1=usecond();
+	  }
 	  double t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
@ -814,20 +818,21 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
+	mf_hi = flops/timestat.min*ni;
-	mf_lo = flops/timestat.max;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call   "<< timestat.mean/ni<<std::endl;
      }
@ -872,7 +877,7 @@ int main (int argc, char ** argv)
  int do_dslash=1;
  int sel=4;
-  std::vector<int> L_list({8,12,16,24});
+  std::vector<int> L_list({8,12,16,24,32});
  int selm1=sel-1;
  std::vector<double> clover;
--- a/systems/Aurora/benchmarks/bench16.pbs
+++ b/systems/Aurora/benchmarks/bench16.pbs
@ -0,0 +1,74 @@
 #!/bin/bash
 ##PBS -q LatticeQCD_aesp_CNDA
 #PBS -q debug-scaling
 ##PBS -q prod
 #PBS -l select=16
 #PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 cp $PBS_NODEFILE nodefile
 export OMP_NUM_THREADS=4
 export MPICH_OFI_NIC_POLICY=GPU
 #export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 #
 # Local vol 16.16.16.32
 #
 LX=16
 LY=16
 LZ=16
 LT=32
 NX=2
 NY=2
 NZ=4
 NT=1
 GX=2
 GY=2
 GZ=1
 GT=3
 PX=$((NX * GX ))
 PY=$((NY * GY ))
 PZ=$((NZ * GZ ))
 PT=$((NT * GT ))
 VX=$((PX * LX ))
 VY=$((PY * LY ))
 VZ=$((PZ * LZ ))
 VT=$((PT * LT ))
 NP=$((PX*PY*PZ*PT))
 VOL=${VX}.${VY}.${VZ}.${VT}
 AT=8
 MPI=${PX}.${PY}.${PZ}.${PT}
 CMD="mpiexec -np $NP -ppn 12  -envall \
 	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi $MPI --grid $VOL \
 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
 echo VOL $VOL
 echo MPI $MPI
 echo NPROC $NP
 echo $CMD
 $CMD
--- a/systems/Aurora/benchmarks/gpu_tile.sh
+++ b/systems/Aurora/benchmarks/gpu_tile.sh
@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
@ -30,8 +30,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A
 if [ $PALS_RANKID = "0" ]
 then
-    numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
+#    numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
-#    numactl -p $NUMAP -N $NUMAP  "$@"
+    numactl -p $NUMAP -N $NUMAP  "$@"
 else 
    numactl -p $NUMAP -N $NUMAP  "$@"
 fi
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@ -1,18 +1,19 @@
 #Ahead of time compile for PVC
-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" 
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
 #JIT compile 
 #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
-../../configure \
+../configure \
 	--enable-simd=GPU \
 	--enable-reduction=grid \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-debug \
 	--prefix $HOME/gpt-install \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--with-lime=$CLIME \
--- a/systems/Frontier/sourceme.sh
+++ b/systems/Frontier/sourceme.sh
@ -2,7 +2,7 @@
 spack load c-lime
 module load emacs 
 module load PrgEnv-gnu
-module load rocm
+module load rocm/6.0.0
 module load cray-mpich
 module load gmp
 module load cray-fftw
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@ -0,0 +1,206 @@
 The purpose of this file is to collate all non-obvious known magic shell variables
 and compiler flags required for either correctness or performance on various systems.
 A repository of work-arounds.
 Contents:
 1. Interconnect + MPI
 2. Compilation
 3. Profiling
 ************************
 * 1. INTERCONNECT + MPI
 ************************
 --------------------------------------------------------------------
 MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O 
 --------------------------------------------------------------------
 export OMPI_MCA_io=romio321
 --------------------------------------
 ROMIO fail with > 2GB per node read (32 bit issue)
 --------------------------------------
 Use later MPICH
 https://github.com/paboyle/Grid/issues/381
 https://github.com/pmodels/mpich/commit/3a479ab0
 --------------------------------------------------------------------
 Slingshot: Frontier and Perlmutter libfabric slow down 
 and physical memory fragmentation 
 --------------------------------------------------------------------
 export FI_MR_CACHE_MONITOR=disabled
 or
 export FI_MR_CACHE_MONITOR=kdreg2
 --------------------------------------------------------------------
 Perlmutter
 --------------------------------------------------------------------
 export MPICH_RDMA_ENABLED_CUDA=1
 export MPICH_GPU_IPC_ENABLED=1
 export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
 export MPICH_GPU_NO_ASYNC_MEMCPY=0
 --------------------------------------------------------------------
 Frontier/LumiG
 --------------------------------------------------------------------
 Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
 cat << EOF > select_gpu
 #!/bin/bash
 export MPICH_GPU_SUPPORT_ENABLED=1
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export GPU_MAP=(0 1 2 3 7 6 5 4)
 export NUMA_MAP=(3 3 1 1 2 2 0 0)
 export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
 export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
 export HIP_VISIBLE_DEVICES=\$GPU
 unset ROCR_VISIBLE_DEVICES
 echo RANK \$SLURM_LOCALID using GPU \$GPU    
 exec numactl -m \$NUMA -N \$NUMA \$*
 EOF
 chmod +x ./select_gpu
 srun ./select_gpu BINARY
 --------------------------------------------------------------------
 Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
 --------------------------------------------------------------------
 export OMPI_MCA_btl=^uct,openib
 export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
 export UCX_RNDV_SCHEME=put_zcopy
 export UCX_RNDV_THRESH=16384
 export UCX_IB_GPU_DIRECT_RDMA=yes
 --------------------------------------------------------------------
 Mellanox + A100 correctness (Tursa, Booster, Leonardo)
 --------------------------------------------------------------------
 export UCX_MEMTYPE_CACHE=n
 --------------------------------------------------------------------
 MPICH/Aurora/PVC correctness and performance 
 --------------------------------------------------------------------
 https://github.com/pmodels/mpich/issues/7302
 --enable-cuda-aware-mpi=no  
 --enable-unified=no
 Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
 Do not use SVM
 Ideally use MPICH with fix to issue 7302:
 https://github.com/pmodels/mpich/pull/7312
 Ideally:
 MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
 Alternatives:
 export MPIR_CVAR_NOLOCAL=1
 export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
 --------------------------------------------------------------------
 MPICH/Aurora/PVC correctness and performance 
 --------------------------------------------------------------------
 Broken:
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 This gives good peformance without requiring 
 --enable-cuda-aware-mpi=no  
 But is an open issue reported by James Osborn
 https://github.com/pmodels/mpich/issues/7139
 Possibly resolved but unclear if in the installed software yet.
 ************************
 * 2. COMPILATION
 ************************
 --------------------------------------------------------------------
 G++ compiler breakage / graveyard
 --------------------------------------------------------------------
 9.3.0, 10.3.1, 
 https://github.com/paboyle/Grid/issues/290
 https://github.com/paboyle/Grid/issues/264
 Working (-) Broken (X):
 4.9.0 -
 4.9.1 -
 5.1.0 X
 5.2.0 X
 5.3.0 X
 5.4.0 X
 6.1.0 X
 6.2.0 X
 6.3.0 -
 7.1.0 -
 8.0.0 (HEAD) -
 https://github.com/paboyle/Grid/issues/100
 --------------------------------------------------------------------
 AMD GPU nodes :
 --------------------------------------------------------------------
 multiple ROCM versions broken; use 5.3.0
 manifests itself as wrong results in fp32 
 https://github.com/paboyle/Grid/issues/464
 --------------------------------------------------------------------
 Aurora/PVC
 --------------------------------------------------------------------
 SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
 SYCL slow link and relocatable code issues (Christoph Lehner)
 Opt large register file required for good performance in fp64
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
 export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fPIC"
 --------------------------------------------------------------------
 Aurora/PVC useful extra options
 --------------------------------------------------------------------
 Host only sanitizer:
 -Xarch_host -fsanitize=leak
 -Xarch_host -fsanitize=address
 Deterministic MPI reduction:
 export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 ************************
 * 3. Visual profile tools
 ************************
 --------------------------------------------------------------------
 Frontier/rocprof
 --------------------------------------------------------------------
 --------------------------------------------------------------------
 Aurora/unitrace
 --------------------------------------------------------------------
 --------------------------------------------------------------------
 Tursa/nsight-sys
 --------------------------------------------------------------------
--- a/systems/sdcc-genoa/bench.slurm
+++ b/systems/sdcc-genoa/bench.slurm
@ -0,0 +1,32 @@
 #!/bin/bash
 #SBATCH --partition lqcd
 #SBATCH --time=00:50:00
 #SBATCH -A lqcdtest
 #SBATCH -q lqcd
 #SBATCH --exclusive
 #SBATCH --nodes=1
 #SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
 #SBATCH --ntasks=1
 #SBATCH --cpus-per-task=64
 #SBATCH --qos lqcd
 source sourceme.sh
 export PLACES=(1:16:4 1:32:2 0:64:1);
 export THR=(16 32 64)
 for t in  2 
 do
 export OMP_NUM_THREADS=${THR[$t]}
 export OMP_PLACES=${PLACES[$t]}
 export thr=${THR[$t]}
 #for vol in 24.24.24.24 32.32.32.32 48.48.48.96
 for vol in 48.48.48.96
 do
 srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
 done
 #srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
 done
--- a/systems/sdcc-genoa/bench2.slurm
+++ b/systems/sdcc-genoa/bench2.slurm
@ -0,0 +1,36 @@
 #!/bin/bash
 #SBATCH --partition lqcd
 #SBATCH --time=00:50:00
 #SBATCH -A lqcdtest
 #SBATCH -q lqcd
 #SBATCH --exclusive
 #SBATCH --nodes=2
 #SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
 #SBATCH --ntasks=2
 #SBATCH --cpus-per-task=64
 #SBATCH --qos lqcd
 source sourceme.sh
 export PLACES=(1:16:4 1:32:2 0:64:1);
 export THR=(16 32 64)
 nodes=2
 mpi=1.1.1.2
 for t in 2 
 do
 export OMP_NUM_THREADS=${THR[$t]}
 export OMP_PLACES=${PLACES[$t]}
 export thr=${THR[$t]}
 #srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
 for vol in 64.64.64.128
 do
 srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
 done
 done
--- a/systems/sdcc-genoa/config-command
+++ b/systems/sdcc-genoa/config-command
@ -0,0 +1,16 @@
 ../../configure \
 --enable-comms=mpi-auto \
 --enable-unified=yes \
 --enable-shm=shmopen \
 --enable-shm-fast-path=shmopen \
 --enable-accelerator=none \
 --enable-simd=AVX512 \
 --disable-accelerator-cshift \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=clang++ \
 MPICXX=mpicxx \
 CXXFLAGS="-std=c++17"
--- a/systems/sdcc-genoa/sourceme.sh
+++ b/systems/sdcc-genoa/sourceme.sh
@ -0,0 +1,4 @@
 source $HOME/spack/share/spack/setup-env.sh
 spack load llvm@17.0.4
 export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
 module load openmpi
--- a/tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
@ -0,0 +1,781 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_general_coarse_hdcg.cc
    Copyright (C) 2023
 Author: Peter Boyle <pboyle@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
 #include <Grid/algorithms/iterative/PowerSpectrum.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 using namespace std;
 using namespace Grid;
 template<class aggregation>
 void SaveFineEvecs(aggregation &Agg,std::string file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(Agg[0].Grid()->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.size();b++){
    WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
  }
  WR.close();
 #endif
 }
 template<class aggregation>
 void SaveBasis(aggregation &Agg,std::string file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(Agg.FineGrid->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    //    WR.writeScidacFieldRecord(Agg.subspace[b],record);
  }
  WR.close();
 #endif
 }
 template<class aggregation>
 void LoadBasis(aggregation &Agg, std::string file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    //    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
  }    
  RD.close();
 #endif
 }
 template<class aggregation>
 void LoadBasisSkip(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    for(int n=0;n<N;n++){
      RD.readScidacFieldRecord(tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
      if(n==0) precisionChange(Agg.subspace[b],tmp);
    }
    //    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
  }    
  RD.close();
 #endif
 }
 template<class aggregation>
 void LoadBasisSum(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  LatticeFermionF sum(tmp.Grid());
  RD.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    sum=Zero();
    for(int n=0;n<N;n++){
      RD.readScidacFieldRecord(tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
      sum=sum+tmp;
    }
    precisionChange(Agg.subspace[b],sum);
    //    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
  }    
  RD.close();
 #endif
 }
 template<class CoarseVector>
 void SaveEigenvectors(std::vector<RealD>            &eval,
 		      std::vector<CoarseVector>     &evec,
 		      std::string evec_file,
 		      std::string eval_file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(evec[0].Grid()->IsBoss());
  WR.open(evec_file);
  for(int b=0;b<evec.size();b++){
    WR.writeScidacFieldRecord(evec[b],record,0,0);
  }
  WR.close();
  XmlWriter WRx(eval_file);
  write(WRx,"evals",eval);
 #endif
 }
 template<class CoarseVector>
 void LoadEigenvectors(std::vector<RealD>            &eval,
 		      std::vector<CoarseVector>     &evec,
 		      std::string evec_file,
 		      std::string eval_file)
 {
 #ifdef HAVE_LIME
    XmlReader RDx(eval_file);
    read(RDx,"evals",eval);
    emptyUserRecord record;
    Grid::ScidacReader RD ;
    RD.open(evec_file);
    assert(evec.size()==eval.size());
    for(int k=0;k<eval.size();k++) {
      RD.readScidacFieldRecord(evec[k],record);
    }
    RD.close();
 #endif
 }
 // Want Op in CoarsenOp to call MatPcDagMatPc
 template<class Field>
 class HermOpAdaptor : public LinearOperatorBase<Field>
 {
  LinearOperatorBase<Field> & wrapped;
 public:
  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
  void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
  void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
  void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
 };
 template<class Field> class FixedCGPolynomial : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator   & _SmootherOperator;
  ConjugateGradientPolynomial<Field>  CG;
  int iters;
  bool record;
  int replay_count;
  FixedCGPolynomial(int _iters, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    iters(_iters),
    record(true),
    CG(0.0,_iters,false)
  {
    std::cout << GridLogMessage<<" FixedCGPolynomial order "<<iters<<std::endl;
    replay_count = 0;
  };
  void operator() (const Field &in, Field &out) 
  {
 #if 1
    GridBase *grid = in.Grid();
    Field Mx0(grid);
    Field r0(grid);
    Field Minvr0(grid);
    _SmootherOperator.HermOp(out,Mx0);
    r0 = in - Mx0;
    Minvr0 = Zero();
    Minvr0.Checkerboard()=in.Checkerboard();
    if ( record ) {
      std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
      CG.Solve(_SmootherOperator,r0,Minvr0);
      record = false;
      /*
      std::cout << "P(x) = 0 "<<std::endl;
      for(int i=0;i<CG.polynomial.size();i++){
 	std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
 	}
      */
      Field tmp(Minvr0.Grid());
      CG.CGsequenceHermOp(_SmootherOperator,r0,tmp);
      tmp = tmp - Minvr0;
      std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
    } else {
      std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
      CG.CGsequenceHermOp(_SmootherOperator,r0,Minvr0);
      if ( replay_count %5== 0 ) record=true;
      replay_count++;
    }
    out = out + Minvr0;
    _SmootherOperator.HermOp(out,r0);
    r0 = r0 - in;
    RealD rr=norm2(r0);
    RealD ss=norm2(in);
    std::cout << " FixedCGPolynomial replayed polynomial resid "<<::sqrt(rr/ss)<<std::endl;
 #else
    out = Zero();
    out.Checkerboard()=in.Checkerboard();
    if ( record ) {
      std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
      CG.Solve(_SmootherOperator,in,out);
      record = false;
      std::cout << "P(x) = 0 "<<std::endl;
      for(int i=0;i<CG.polynomial.size();i++){
 	std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
      }
      Field tmp(in.Grid());
      CG.CGsequenceHermOp(_SmootherOperator,in,tmp);
      tmp = tmp - out;
      std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
    } else {
      std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
      CG.CGsequenceHermOp(_SmootherOperator,in,out);
      if ( replay_count %5== 5 ) record=true;
      replay_count++;
    }
 #endif
  }
  void operator() (const std::vector<Field> &in, std::vector<Field> &out)
  {
    for(int i=0;i<out.size();i++){
      out[i]=Zero();
    }
    int blockDim = 0;//not used for BlockCGVec
    BlockConjugateGradient<Field>    BCGV  (BlockCGrQVec,blockDim,0.0,iters,false);
    BCGV(_SmootherOperator,in,out);
  }
 };
 template<class Field> class CGSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator   & _SmootherOperator;
  int iters;
  CGSmoother(int _iters, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    iters(_iters)
  {
    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
  };
  void operator() (const Field &in, Field &out) 
  {
    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother
    out=Zero();
    CG(_SmootherOperator,in,out);
  }
 };
 RealD InverseApproximation(RealD x){
  return 1.0/x;
 }
 template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator   & _SmootherOperator;
  Chebyshev<Field> Cheby;
  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    Cheby(_lo,_hi,_ord,InverseApproximation)
  {
    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
  };
  void operator() (const Field &in, Field &out) 
  {
    //    Field r(out.Grid());
    Cheby(_SmootherOperator,in,out);
    //    _SmootherOperator.HermOp(out,r);
    //    r=r-in;
    //    RealD rr=norm2(r);
    //    RealD ss=norm2(in);
    //    std::cout << GridLogMessage<<" Chebyshev smoother resid "<<::sqrt(rr/ss)<<std::endl;
  }
 };
 template<class Field> class ChebyshevInverter : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator   & _Operator;
  Chebyshev<Field> Cheby;
  ChebyshevInverter(RealD _lo,RealD _hi,int _ord, FineOperator &Operator) :
    _Operator(Operator),
    Cheby(_lo,_hi,_ord,InverseApproximation)
  {
    std::cout << GridLogMessage<<" Chebyshev Inverter order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
  };
  void operator() (const Field &in, Field &out) 
  {
    Field r(in.Grid());
    Field AinvR(in.Grid());
    _Operator.HermOp(out,r);
    r = in - r; // b - A x
    Cheby(_Operator,r,AinvR); // A^{-1} ( b - A x ) ~ A^{-1} b - x
    out = out + AinvR;
    _Operator.HermOp(out,r);
    r = in - r; // b - A x
    RealD rr = norm2(r);
    RealD ss = norm2(in);
    std::cout << "ChebshevInverse resid " <<::sqrt(rr/ss)<<std::endl;
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int sample=1;
  if( GridCmdOptionExists(argv,argv+argc,"--sample") ){
    std::string arg;
    arg = GridCmdOptionPayload(argv,argv+argc,"--sample");
    GridCmdOptionInt(arg,sample);
  }
  const int Ls=24;
  const int nbasis = 62;
  const int cb = 0 ;
  RealD mass=0.00078;
  if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
    std::string arg;
    arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
    GridCmdOptionFloat(arg,mass);
  }
  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;
  std::cout << GridLogMessage << " *************************** " <<std::endl;
  std::cout << GridLogMessage << " Mass " <<mass<<std::endl;
  std::cout << GridLogMessage << " M5   " <<M5<<std::endl;
  std::cout << GridLogMessage << " Ls   " <<Ls<<std::endl;
  std::cout << GridLogMessage << " b    " <<b<<std::endl;
  std::cout << GridLogMessage << " c    " <<c<<std::endl;
  std::cout << GridLogMessage << " *************************** " <<std::endl;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
 								   GridDefaultSimd(Nd,vComplex::Nsimd()),
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  //////////////////////////////////////////
  // Single precision grids -- lanczos + smoother
  //////////////////////////////////////////
  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
 								   GridDefaultSimd(Nd,vComplexF::Nsimd()),
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  ///////////////////////// Configuration /////////////////////////////////
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat.1000");
  NerscIO::readConfiguration(Umu,header,file);
  //////////////////////// Fermion action //////////////////////////////////
  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
  std::cout << "**************************************"<<std::endl;
  std::cout << "         Fine Power method            "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  {
    LatticeFermionD pm_src(FrbGrid);
    pm_src = ComplexD(1.0);
    PowerMethod<LatticeFermionD>       fPM;
    fPM(HermOpEO,pm_src);
  }
  if(0)
  {
    std::cout << "**************************************"<<std::endl;
    std::cout << "         Fine Lanczos           "<<std::endl;
    std::cout << "**************************************"<<std::endl;
    typedef LatticeFermionF FermionField;
    LatticeGaugeFieldF UmuF(UGridF);
    precisionChange(UmuF,Umu);
    MobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,b,c);
    SchurDiagMooeeOperator<MobiusFermionF, LatticeFermionF> HermOpEOF(DdwfF);
    const int Fine_Nstop = 200;
    const int Fine_Nk = 200;
    const int Fine_Np = 200;
    const int Fine_Nm = Fine_Nk+Fine_Np;
    const int Fine_MaxIt= 10;
    RealD Fine_resid = 1.0e-4;
    std::cout << GridLogMessage << "Fine Lanczos "<<std::endl;
    std::cout << GridLogMessage << "Nstop "<<Fine_Nstop<<std::endl;
    std::cout << GridLogMessage << "Nk "<<Fine_Nk<<std::endl;
    std::cout << GridLogMessage << "Np "<<Fine_Np<<std::endl;
    std::cout << GridLogMessage << "resid "<<Fine_resid<<std::endl;
    Chebyshev<FermionField> Cheby(0.002,92.0,401);
    //    Chebyshev<FermionField> Cheby(0.1,92.0,401);
    FunctionHermOp<FermionField> OpCheby(Cheby,HermOpEOF);
    PlainHermOp<FermionField> Op     (HermOpEOF);
    ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Fine_Nstop,Fine_Nk,Fine_Nm,Fine_resid,Fine_MaxIt);
    std::vector<RealD>          Fine_eval(Fine_Nm);
    FermionField                Fine_src(FrbGridF); 
    Fine_src = ComplexF(1.0);
    std::vector<FermionField> Fine_evec(Fine_Nm,FrbGridF);
    int Fine_Nconv;
    std::cout << GridLogMessage <<" Calling IRL.calc single prec"<<std::endl;
    IRL.calc(Fine_eval,Fine_evec,Fine_src,Fine_Nconv);
    std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
    SaveFineEvecs(Fine_evec,evec_file);
  }
  //////////////////////////////////////////
  // Construct a coarsened grid with 4^4 cell
  //////////////////////////////////////////
  Coordinate Block({4,4,6,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/Block[d];
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
 							    GridDefaultSimd(Nd,vComplex::Nsimd()),
 							    GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  ///////////////////////// RNGs /////////////////////////////////
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);
  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FrbGrid,cb);
  ////////////////////////////////////////////////////////////
  // Need to check about red-black grid coarsening
  ////////////////////////////////////////////////////////////
  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
  //  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
  //  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
  bool load_agg=true;
  bool load_refine=true;
  bool load_mat=false;
  bool load_evec=false;
  int refine=1;
  if ( load_agg ) {
    if ( !(refine) || (!load_refine) ) { 
      LoadBasis(Aggregates,subspace_file);
    }
  } else {
    //    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
    //    					0.0003,1.0e-5,2000); // Lo, tol, maxit
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run
    Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.); 
    SaveBasis(Aggregates,subspace_file);
  }
  std::cout << "**************************************"<<std::endl;
  std::cout << "Building MultiRHS Coarse operator"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true);
  const int nrhs=24;
  Coordinate mpi=GridDefaultMpi();
  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);
  std::cout << "**************************************"<<std::endl;
  std::cout << "         Coarse Lanczos               "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
  Chebyshev<CoarseVector>      IRLCheby(0.005,42.0,301);  // 1 iter
  MrhsHermMatrix MrhsCoarseOp     (mrhs);
  //  CoarseVector pm_src(CoarseMrhs);
  //  pm_src = ComplexD(1.0);
  //  PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src);
  int Nk=192;
  int Nm=384;
  int Nstop=Nk;
  int Nconv_test_interval=1;
  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
 							  Coarse5d,
 							  CoarseMrhs,
 							  nrhs,
 							  IRLCheby,
 							  Nstop,
 							  Nconv_test_interval,
 							  nrhs,
 							  Nk,
 							  Nm,
 							  1e-5,10);
  int Nconv;
  std::vector<RealD>            eval(Nm);
  std::vector<CoarseVector>     evec(Nm,Coarse5d);
  std::vector<CoarseVector>     c_src(nrhs,Coarse5d);
  ///////////////////////
  // Deflation guesser object
  ///////////////////////
  MultiRHSDeflation<CoarseVector> MrhsGuesser;
  //////////////////////////////////////////
  // Block projector for coarse/fine
  //////////////////////////////////////////
  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
  //////////////////////////
  // Extra HDCG parameters
  //////////////////////////
  int maxit=300;
  ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false);
  ConjugateGradient<CoarseVector>  CGstart(5.0e-2,maxit,false);
  RealD lo=2.0;
  int ord = 7;
  //  int ord = 11;
  int blockDim = 0;//not used for BlockCG
  BlockConjugateGradient<CoarseVector>    BCG  (BlockCGrQ,blockDim,5.0e-5,maxit,true);
  DoNothingGuesser<CoarseVector> DoNothing;
  //  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
  //  HPDSolver<CoarseVector> HPDSolveMrhsStart(MrhsCoarseOp,CGstart,DoNothing);
  //  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,BCG,DoNothing);
  //  HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,BCG,DoNothing);
  //  FixedCGPolynomial<CoarseVector>  HPDSolveMrhs(maxit,MrhsCoarseOp);
  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp);  //
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,110,MrhsCoarseOp);  // 114 iter with Chebysmooth and BlockCG
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp); // 138 iter with Chebysmooth
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,200,MrhsCoarseOp); // 139 iter
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-3,40.0,200,MrhsCoarseOp); // 137 iter, CG smooth, flex
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-3,40.0,200,MrhsCoarseOp); // 146 iter, CG smooth, flex
  //  ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-4,40.0,200,MrhsCoarseOp); // 156 iter, CG smooth, flex
  /////////////////////////////////////////////////
  // Mirs smoother
  /////////////////////////////////////////////////
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,lo);
  //  FixedCGPolynomial<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
  //  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
  ChebyshevSmoother<LatticeFermionD> CGsmooth(2.0,92.0,8,HermOpEO) ;
  if ( load_refine ) {
    LoadBasis(Aggregates,refine_file);
    //    LatticeFermionF conv_tmp(FrbGridF);
    //    LoadBasisSum(Aggregates,refine_file,sample,conv_tmp);
  } else {
    Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters
    SaveBasis(Aggregates,refine_file);
  }
  Aggregates.Orthogonalise();
  std::cout << "**************************************"<<std::endl;
  std::cout << "Coarsen after refine"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);
  std::cout << "**************************************"<<std::endl;
  std::cout << " Recompute coarse evecs  "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  evec.resize(Nm,Coarse5d);
  eval.resize(Nm);
  for(int r=0;r<nrhs;r++){
    random(CRNG,c_src[r]);
  }
 IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl);
  std::cout << "**************************************"<<std::endl;
  std::cout << " Reimport coarse evecs  "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsGuesser.ImportEigenBasis(evec,eval);
  std::cout << "**************************************"<<std::endl;
  std::cout << " Setting up mRHS HDCG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
  MrhsProjector.ImportBasis(Aggregates.subspace);
  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling mRHS HDCG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
    HDCGmrhs(1.0e-8, 300,
 	     FineHermOp,
 	     CGsmooth,
 	     HPDSolveMrhs,    // Used in M1
 	     HPDSolveMrhs,          // Used in Vstart
 	     MrhsProjector,
 	     MrhsGuesser,
 	     CoarseMrhs);
  std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
  std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
  LatticeFermionD result_accurate(FrbGrid);
  LatticeFermionD result_sloppy(FrbGrid);
  LatticeFermionD error(FrbGrid);
  LatticeFermionD residual(FrbGrid);
  for(int r=0;r<nrhs;r++){
    random(RNG5,src_mrhs[r]);
    res_mrhs[r]=Zero();
  }
  HDCGmrhs(src_mrhs,res_mrhs);
  result_accurate = res_mrhs[0];
 #if 0
  std::vector<RealD>   bins({1.0e-3,1.0e-2,1.0e-1,1.0,10.0,100.0});
  std::vector<int>   orders({6000  ,4000  ,1000  ,500,500 ,500});
  PowerSpectrum GraphicEqualizer(bins,orders);
  std::cout << "**************************************"<<std::endl;
  std::cout << GridLogMessage << " PowerSpectrum of rrr "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  GraphicEqualizer(FineHermOp,HDCGmrhs.rrr);
  std::cout << "**************************************"<<std::endl;
  std::cout << GridLogMessage << " PowerSpectrum of sss "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  GraphicEqualizer(FineHermOp,HDCGmrhs.sss);
  std::cout << "**************************************"<<std::endl;
  std::cout << GridLogMessage << " PowerSpectrum of qqq "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  GraphicEqualizer(FineHermOp,HDCGmrhs.qqq);
  std::cout << "**************************************"<<std::endl;
  std::cout << GridLogMessage << " PowerSpectrum of zzz "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  GraphicEqualizer(FineHermOp,HDCGmrhs.zzz);
  std::vector<RealD> tols({1.0e-3,1.0e-4,1.0e-5});
  for(auto tol : tols) {
    TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
      HDCGmrhsSloppy(tol, 500,
 		     FineHermOp,
 		     CGsmooth,
 		     HPDSolveMrhs,    // Used in M1
 		     HPDSolveMrhs,    // Used in Vstart
 		     MrhsProjector,
 		     MrhsGuesser,
 		     CoarseMrhs);
    //  Solve again to 10^-5
    for(int r=0;r<nrhs;r++){
      res_mrhs[r]=Zero();
    }
    HDCGmrhsSloppy(src_mrhs,res_mrhs);
    result_sloppy = res_mrhs[0];
    error = result_sloppy - result_accurate;
    FineHermOp.HermOp(result_sloppy,residual);
    residual = residual - src_mrhs[0];
    std::cout << "**************************************"<<std::endl;
    std::cout << GridLogMessage << " Converged to tolerance "<< tol<<std::endl;
    std::cout << GridLogMessage << " Absolute error "<<norm2(error)<<std::endl;
    std::cout << GridLogMessage << " Residual       "<<norm2(residual)<<std::endl;
    std::cout << "**************************************"<<std::endl;
    std::cout << "**************************************"<<std::endl;
    std::cout << GridLogMessage << " PowerSpectrum of error   "<<std::endl;
    std::cout << "**************************************"<<std::endl;
    GraphicEqualizer(FineHermOp,error);
    std::cout << "**************************************"<<std::endl;
    std::cout << GridLogMessage << " PowerSpectrum of residual   "<<std::endl;
    std::cout << "**************************************"<<std::endl;
    GraphicEqualizer(FineHermOp,residual);
  };
 #endif
  // Standard CG
 #if 0
  {
  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling red black CG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
    LatticeFermion result(FrbGrid); result=Zero();
    LatticeFermion    src(FrbGrid); random(RNG5,src);
    result=Zero();
    ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
    CGfine(HermOpEO, src, result);
  }
 #endif  
  Grid_finalize();
  return 0;
 }
--- a/tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
@ -0,0 +1,355 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_general_coarse_hdcg.cc
    Copyright (C) 2023
 Author: Peter Boyle <pboyle@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
 using namespace std;
 using namespace Grid;
 template<class aggregation>
 void SaveFineEvecs(aggregation &Agg,std::string file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(Agg[0].Grid()->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.size();b++){
    WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
  }
  WR.close();
 #endif
 }
 template<class aggregation>
 void SaveBasis(aggregation &Agg,std::string file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(Agg.FineGrid->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    //    WR.writeScidacFieldRecord(Agg.subspace[b],record);
  }
  WR.close();
 #endif
 }
 template<class aggregation>
 void LoadBasis(aggregation &Agg, std::string file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    //    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
  }    
  RD.close();
 #endif
 }
 template<class aggregation>
 void LoadFineEvecs(aggregation &Agg, std::string file,LatticeFermionF & conv_tmp)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.size();b++){
    RD.readScidacFieldRecord(conv_tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    precisionChange(Agg[b],conv_tmp);
  }    
  RD.close();
 #endif
 }
 template<class CoarseVector>
 void SaveEigenvectors(std::vector<RealD>            &eval,
 		      std::vector<CoarseVector>     &evec,
 		      std::string evec_file,
 		      std::string eval_file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(evec[0].Grid()->IsBoss());
  WR.open(evec_file);
  for(int b=0;b<evec.size();b++){
    WR.writeScidacFieldRecord(evec[b],record,0,0);
  }
  WR.close();
  XmlWriter WRx(eval_file);
  write(WRx,"evals",eval);
 #endif
 }
 template<class CoarseVector>
 void LoadEigenvectors(std::vector<RealD>            &eval,
 		      std::vector<CoarseVector>     &evec,
 		      std::string evec_file,
 		      std::string eval_file)
 {
 #ifdef HAVE_LIME
    XmlReader RDx(eval_file);
    read(RDx,"evals",eval);
    emptyUserRecord record;
    Grid::ScidacReader RD ;
    RD.open(evec_file);
    assert(evec.size()==eval.size());
    for(int k=0;k<eval.size();k++) {
      RD.readScidacFieldRecord(evec[k],record);
    }
    RD.close();
 #endif
 }
 // Want Op in CoarsenOp to call MatPcDagMatPc
 template<class Field>
 class HermOpAdaptor : public LinearOperatorBase<Field>
 {
  LinearOperatorBase<Field> & wrapped;
 public:
  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
  void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
  void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
  void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
 };
 template<class Field> class CGSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator   & _SmootherOperator;
  int iters;
  CGSmoother(int _iters, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    iters(_iters)
  {
    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
  };
  void operator() (const Field &in, Field &out) 
  {
    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother
    out=Zero();
    CG(_SmootherOperator,in,out);
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  const int Ls=24;
  const int nbasis = 62;
  const int cb = 0 ;
  RealD mass=0.00078;
  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
 								   GridDefaultSimd(Nd,vComplex::Nsimd()),
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  // Construct a coarsened grid with 4^4 cell
  Coordinate Block({4,4,6,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/Block[d];
  }
  //////////////////////////////////////////
  // Double precision grids 
  //////////////////////////////////////////
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
 							    GridDefaultSimd(Nd,vComplex::Nsimd()),
 							    GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  //////////////////////////////////////////
  // Single precision grids -- lanczos + smoother
  //////////////////////////////////////////
  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
 								   GridDefaultSimd(Nd,vComplexF::Nsimd()),
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  ///////////////////////// RNGs /////////////////////////////////
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  ///////////////////////// Configuration /////////////////////////////////
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat.1000");
  NerscIO::readConfiguration(Umu,header,file);
  //////////////////////// Fermion action //////////////////////////////////
  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
  const int Fine_Nstop = 200;
  const int Fine_Nk = 100;
  const int Fine_Np = 100;
  const int Fine_Nm = Fine_Nk+Fine_Np;
  typedef LatticeFermion FermionField;
  std::vector<RealD>        Fine_eval;
  std::vector<FermionField> Fine_evec;
  LatticeFermionF conv_tmp(FrbGridF);
  Fine_eval.resize(Fine_Nstop);
  Fine_evec.resize(Fine_Nstop,FrbGrid);
  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
  LoadFineEvecs(Fine_evec,evec_file,conv_tmp);
  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);
  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FrbGrid,cb);
  ////////////////////////////////////////////////////////////
  // Need to check about red-black grid coarsening
  ////////////////////////////////////////////////////////////
  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
  //  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
  //  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evec");
  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
  //  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
  //  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
  //  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
  bool load_agg=true;
  bool load_refine=true;
  //////////////////////////////////////////
  // Block projector for coarse/fine
  //////////////////////////////////////////
  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
  /////////////////////////////////////////////////
  // Mirs smoother
  /////////////////////////////////////////////////
  int ord=8;
  RealD lo=2.0;
  RealD MirsShift = lo;
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
  LoadBasis(Aggregates,refine_file);
  Aggregates.Orthogonalise();
  std::cout << "**************************************"<<std::endl;
  std::cout << " Using filtered subspace"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
  MrhsProjector.ImportBasis(Aggregates.subspace);
  FermionField Ftmp(FrbGrid);
  std::vector<FermionField> Fine_ev(1,FrbGrid);
  std::vector<FermionField> Fine_ev_compressed(1,FrbGrid);
  std::vector<CoarseVector>  c_evec(1,Coarse5d);
  for(int ev=0;ev<Fine_evec.size();ev++){
    Fine_ev[0] = Fine_evec[ev];
    MrhsProjector.blockProject(Fine_ev,c_evec);
    MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
    Ftmp = Fine_ev_compressed[0];
    RealD div = 1.0/ sqrt(norm2(Ftmp));
    Ftmp = Ftmp * div;
    std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0])  <<std::endl;
    std::cout << GridLogMessage<<" "<<ev<<" comp   "<< norm2(Ftmp)  <<std::endl;
    Ftmp = Fine_ev[0] - Ftmp;
    std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp)  <<std::endl;
    CGsmooth(Fine_ev_compressed[0],Ftmp);
    Ftmp = Ftmp *lo;
    std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp)  <<std::endl;
    div = 1.0/ sqrt(norm2(Ftmp));
    Ftmp=Ftmp*div;
    Ftmp = Fine_ev[0]-Ftmp;
    std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp)  <<std::endl;
  }
  std::cout << "**************************************"<<std::endl;
  std::cout << " Using eigenvector subspace "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  for(int i=0;i<Aggregates.subspace.size();i++){
    Aggregates.subspace[i] = Fine_evec[i];
  }
  Aggregates.Orthogonalise();
  MrhsProjector.ImportBasis(Aggregates.subspace);
  for(int ev=0;ev<Fine_evec.size();ev++){
    Fine_ev[0] = Fine_evec[ev];
    MrhsProjector.blockProject(Fine_ev,c_evec);
    MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
    Ftmp = Fine_ev_compressed[0];
    RealD div = 1.0/ sqrt(norm2(Ftmp));
    Ftmp = Ftmp * div;
    std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0])  <<std::endl;
    std::cout << GridLogMessage<<" "<<ev<<" comp   "<< norm2(Ftmp)  <<std::endl;
    Ftmp = Fine_ev[0] - Ftmp;
    std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp)  <<std::endl;
    CGsmooth(Fine_ev_compressed[0],Ftmp);
    Ftmp = Ftmp *lo;
    std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp)  <<std::endl;
    div = 1.0/ sqrt(norm2(Ftmp));
    Ftmp=Ftmp*div;
    Ftmp = Fine_ev[0]-Ftmp;
    std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp)  <<std::endl;
  }
  // Standard CG
  Grid_finalize();
  return 0;
 }
--- a/tests/debug/Test_general_coarse_pvdagm.cc
+++ b/tests/debug/Test_general_coarse_pvdagm.cc
@ -36,28 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 template<class Field>
 class HermOpAdaptor : public LinearOperatorBase<Field>
 {
  LinearOperatorBase<Field> & wrapped;
 public:
  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  void Op     (const Field &in, Field &out){
    wrapped.HermOp(in,out);
  }
  void AdjOp     (const Field &in, Field &out){
    wrapped.HermOp(in,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  void HermOp(const Field &in, Field &out){
    wrapped.HermOp(in,out);
  }
 };
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
@ -69,78 +47,169 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  void Op     (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
  }
  void AdjOp     (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
-    _PV.M(tmp,out);
+    _PV.M(in,tmp);
-    _Mat.Mdag(in,tmp);
+    _Mat.Mdag(tmp,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  void HermOp(const Field &in, Field &out){
-    std::cout << "HermOp"<<std::endl;
+    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    //    _Mat.M(in,tmp);
    //    _PV.Mdag(tmp,out);
    //    _PV.M(out,tmp);
    //    _Mat.Mdag(tmp,out);
    Op(in,tmp);
    AdjOp(tmp,out);
    //    std::cout << "HermOp done "<<norm2(out)<<std::endl;
  }
 };
 template<class Matrix,class Field>
 class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  RealD shift;
 public:
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): shift(_shift),_Mat(Mat),_PV(PV){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  void Op     (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
-    _PV.M(out,tmp);
+    out = out + shift * in;
    _Mat.Mdag(tmp,out);
    std::cout << "HermOp done "<<norm2(out)<<std::endl;
  }
 };
 template<class Field> class DumbOperator  : public LinearOperatorBase<Field> {
 public:
  LatticeComplex scale;
  DumbOperator(GridBase *grid) : scale(grid)
  {
    scale = 0.0;
    LatticeComplex scalesft(grid);
    LatticeComplex scaletmp(grid);
    for(int d=0;d<4;d++){
      Lattice<iScalar<vInteger> > x(grid); LatticeCoordinate(x,d+1);
      LatticeCoordinate(scaletmp,d+1);
      scalesft = Cshift(scaletmp,d+1,1);
      scale = 100.0*scale + where( mod(x    ,2)==(Integer)0, scalesft,scaletmp);
    }
    std::cout << " scale\n" << scale << std::endl;
  }
  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {};
  void OpDir  (const Field &in, Field &out,int dir,int disp){};
  void OpDirAll  (const Field &in, std::vector<Field> &out) {};
  void Op     (const Field &in, Field &out){
    out = scale * in;
  }
  void AdjOp     (const Field &in, Field &out){
-    out = scale * in;
+    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(tmp,out);
    _Mat.Mdag(in,tmp);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  void HermOp(const Field &in, Field &out){
-    double n1, n2;
+    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
-    HermOpAndNorm(in,out,n1,n2);
+    Field tmp(in.Grid());
-  }
+    Op(in,tmp);
-  void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+    AdjOp(tmp,out);
    ComplexD dot;
    out = scale * in;
    dot= innerProduct(in,out);
    n1=real(dot);
    dot = innerProduct(out,out);
    n2=real(dot);
  }
 };
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
  typedef LinearOperatorBase<FineField>                            FineOperator;
  typedef LinearFunction    <FineField>                            FineSmoother;
  typedef LinearOperatorBase<CoarseVector>                         CoarseOperator;
  typedef LinearFunction    <CoarseVector>                         CoarseSolver;
  Aggregates     & _Aggregates;
  FineOperator   & _FineOperator;
  FineSmoother   & _PreSmoother;
  FineSmoother   & _PostSmoother;
  CoarseOperator & _CoarseOperator;
  CoarseSolver   & _CoarseSolve;
  int    level;  void Level(int lv) {level = lv; };
  MGPreconditioner(Aggregates &Agg,
 		   FineOperator &Fine,
 		   FineSmoother &PreSmoother,
 		   FineSmoother &PostSmoother,
 		   CoarseOperator &CoarseOperator_,
 		   CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)  {  }
  virtual void operator()(const FineField &in, FineField & out) 
  {
    GridBase *CoarseGrid = _Aggregates.CoarseGrid;
    //    auto CoarseGrid = _CoarseOperator.Grid();
    CoarseVector Csrc(CoarseGrid);
    CoarseVector Csol(CoarseGrid);
    FineField vec1(in.Grid());
    FineField vec2(in.Grid());
    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
    //    std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
    double t;
    // Fine Smoother
    //    out = in;
    out = Zero();
    t=-usecond();
    _PreSmoother(in,out);
    t+=usecond();
    std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
    // Update the residual
    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);   
    //    std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
    // Fine to Coarse 
    t=-usecond();
    _Aggregates.ProjectToSubspace  (Csrc,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse correction
    t=-usecond();
    Csol = Zero();
    _CoarseSolve(Csrc,Csol);
    //Csol=Zero();
    t+=usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse to Fine
    t=-usecond();  
    //    _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
    _Aggregates.PromoteFromSubspace(Csol,vec1); 
    add(out,out,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
    // Residual
    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);  
    //    std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
    // Fine Smoother
    t=-usecond();
    //    vec2=vec1;
    vec2=Zero();
    _PostSmoother(vec1,vec2);
    t+=usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
    add( out,out,vec2);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
-  const int Ls=2;
+  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -151,7 +220,8 @@ int main (int argc, char ** argv)
  // Construct a coarsened grid
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
-    clatt[d] = clatt[d]/4;
+    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
@ -173,15 +243,14 @@ int main (int argc, char ** argv)
  FieldMetaData header;
  std::string file("ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  //Umu = 1.0;
-  RealD mass=0.5;
+  RealD mass=0.01;
  RealD M5=1.8;
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
-  const int nbasis = 1;
+  const int nbasis = 20;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
@ -194,24 +263,50 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
-  PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM(Ddwf,Dpv);
+  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
-  HermOpAdaptor<LatticeFermionD> HOA(PVdagM);
+  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
  PVdagM_t PVdagM(Ddwf,Dpv);
  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric 
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... 
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ...  with 24 vecs
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ...  with 24 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ...  with 16 vecs and 2^4 blocking, sloppier
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking, looser coarse
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64  ...  with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // 
  // Run power method on HOA??
-  PowerMethod<LatticeFermion>       PM;   PM(HOA,src);
+  //  PowerMethod<LatticeFermion>       PM;   PM(PVdagM,src);
  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace AggregatesPD(Coarse5d,FGrid,cb);
  /*
  AggregatesPD.CreateSubspaceChebyshev(RNG5,
-				       HOA,
+				       PVdagM,
 				       nbasis,
-				       5000.0,
+				       4000.0,
-				       0.02,
+				       2.0,
-				       100,
+				       200,
-				       50,
+				       200,
-				       50,
+				       200,
 				       0.0);
  */
  AggregatesPD.CreateSubspaceGCR(RNG5,
 				 PVdagM,
 				 nbasis);
  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
  LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD);
@ -257,6 +352,60 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
  /**********
   * Some solvers
   **********
   */
  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10); 
  L2PGCR.Level(3);
  c_res=Zero();
  L2PGCR(c_src,c_res);
  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  TrivialPrecon<LatticeFermionD> simple_fine;
  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
  SmootherGCR.Level(2);
  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);
  f_src = one;  // 1 in every element for vector 1.
  f_res=Zero();
  SmootherGCR(f_src,f_res);
  typedef MGPreconditioner<vSpinColourVector,  vTComplex,nbasis> TwoLevelMG;
  TwoLevelMG TwoLevelPrecon(AggregatesPD,
 			    PVdagM,
 			    simple_fine,
 			    SmootherGCR,
 			    LinOpCoarse,
 			    L2PGCR);
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
Author	SHA1	Message	Date
Peter Boyle	199818bd6c	Merge pull request #475 from lehner/feature-aurora Sync with GPT on Aurora	2025-03-13 08:55:55 -04:00
Christoph Lehner	fe66c7ca30	verbosity	2025-03-13 12:49:36 +00:00
Christoph Lehner	e9177e4af3	Blas compatibility	2025-03-13 08:48:23 +00:00
Christoph Lehner	d15a6c5933	Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora	2025-03-13 07:29:55 +00:00
Peter Boyle	25ab9325e7	Use hostVector but remove construct resize	2025-03-11 15:02:32 +00:00
Peter Boyle	19f9378b98	Should work on Aurora nowb	2025-03-11 13:50:43 +00:00
Christoph Lehner	9ffd1ed4ce	Merged	2025-03-08 15:30:08 +00:00
Peter Boyle	3d014864e2	Makinig LLVM happy	2025-03-06 14:19:25 -05:00
Peter Boyle	1d22841811	Working on aurora, GPT issue turned up is fixed	2025-03-06 03:20:18 +00:00
Peter Boyle	a1cdda833f	Update WorkArounds.txt	2025-03-05 14:04:23 -05:00
Peter Boyle	ad6db92690	Update WorkArounds.txt	2025-03-05 14:00:26 -05:00
Peter Boyle	e8ff9d8e50	Update WorkArounds.txt	2025-03-05 14:00:04 -05:00
Peter Boyle	795769c636	Update WorkArounds.txt	2025-03-05 13:50:41 -05:00
Peter Boyle	267a39d943	Update WorkArounds.txt	2025-03-05 13:49:43 -05:00
Peter Boyle	3624bd3d22	Update WorkArounds.txt	2025-03-05 13:45:09 -05:00
Peter Boyle	bc12dbbb38	Update WorkArounds.txt	2025-03-05 12:48:56 -05:00
Peter Boyle	eb8a008a8f	Create WorkArounds.txt	2025-03-05 12:41:59 -05:00
Peter Boyle	c4d9aa1a21	Config command that makes GPT happier	2025-02-27 20:12:49 +00:00
Peter Boyle	6ae809ed40	Print not liked on GPT compile	2025-02-27 20:12:49 +00:00
Peter Boyle	311e2aab3f	Update Accelerator.h	2025-02-26 11:42:52 -05:00
Peter Boyle	438dfbdb83	Only throw if there is a pending list entry in CommsComplete	2025-02-25 16:57:27 +00:00
Peter Boyle	b2ce760cf4	Verbose issue with GPT	2025-02-25 16:55:23 +00:00
Peter Boyle	ba9bbe0221	Bounce MPI through host	2025-02-12 19:34:59 +00:00
Peter Boyle	4c3dd82d84	CSHIFT with bounce throuhgh Host memory on MPI packets	2025-02-12 19:09:53 +00:00
Peter Boyle	44e911b5b7	Comment change	2025-02-12 17:37:55 +00:00
Peter Boyle	a7a16df9d0	GET not put has kinder barrier sequence for NVLINK type access as when GET is done, I can use it without barrier. Moves a barrier to a nicer place, overlapped with DtoH DMA	2025-02-12 14:59:28 +00:00
Peter Boyle	382e0abefd	Was issueing a double fence -- the gather also fences	2025-02-12 14:57:28 +00:00
Peter Boyle	6fdefe5b90	Barrier sequencing if doing "GET" not "PUT" is different. This is somewhat better timing for Barriers	2025-02-12 14:55:20 +00:00
Peter Boyle	4788dd8e2e	More states in packet progression for GPU non aware MPI	2025-02-12 14:53:57 +00:00
Peter Boyle	1cc5f221f3	GET not put ordering is better as I know when I've got all MY data	2025-02-12 14:53:05 +00:00
Peter Boyle	93251bfba0	GET not put for better ordering in the downstream dependent kernels -- I know when I'm done, so we can move a barrier / handshake between ranks intranode to a point off critical path	2025-02-12 14:50:21 +00:00
Peter Boyle	18b79508b8	New line better for pretty print	2025-02-12 14:49:48 +00:00
Peter Boyle	4de5ed1613	Remove vector view. The std::vector will not inform Memory manager of deletion and so a stale entry could be left. It is not and should not be used.	2025-02-12 14:48:46 +00:00
Peter Boyle	0baaddbe98	Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384 nodes. More concurrency/fine grained scheduling is possible.	2025-02-04 19:27:26 +00:00
Peter Boyle	b50fb34e71	Perf on Aurora	2025-02-01 18:39:34 +00:00
Peter Boyle	de84d730ff	Fastest run config on Aurora to date	2025-02-01 18:08:40 +00:00
Peter Boyle	c74d11e3d7	PVdagM MG	2025-02-01 11:04:13 -05:00
Christoph Lehner	84cab5e6e7	no comms and log cleanup	2025-02-01 16:37:21 +01:00
Peter Boyle	c4fc972fec	Merge branch 'feature/deprecate-uvm' into develop	2025-01-31 16:32:36 +00:00
Peter Boyle	3f3661a86f	Heading towards PVdagM multigrid	2025-01-17 14:33:35 +00:00
Peter Boyle	5a4f9bf2e3	Force the ROCM version	2024-10-29 18:12:31 -04:00
Peter Boyle	f617468e04	Update Lattice_base.h	2024-10-11 10:39:16 -04:00
Peter Boyle	ee4046fe92	Added a dimension ordered column sum based reduction for scalar. Removes dependence on MPI_Allreduce and allows for work around on systems where this is bollox.	2024-09-27 09:26:03 -04:00
Peter Boyle	2a9cfeb9ea	New files	2024-09-26 14:23:29 -04:00
Peter Boyle	1147b8ea40	Cheby poly setup	2024-09-26 14:20:32 -04:00
Peter Boyle	3f9119b39d	Remove vectors used for the power spectrum table in paper	2024-09-26 14:19:41 -04:00
Peter Boyle	35e8225abd	Verbose control	2024-09-26 14:18:35 -04:00
Peter Boyle	bdbfbb7a14	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-09-26 14:05:45 -04:00
Peter Boyle	f7d4be8d96	Calculate bytes correctly	2024-09-26 14:04:44 -04:00