mirror of https://github.com/paboyle/Grid.git synced 2025-11-10 08:39:31 +00:00

Compare commits


103 Commits

Author SHA1 Message Date
Chulwoo Jung
7780d88d26 Adding simple lanczos, boundary to specflow(!) 2025-08-06 23:41:53 +00:00
Chulwoo Jung
2bf9179d2c Adding mass step 2025-08-06 16:52:51 +00:00
Chulwoo Jung
c606f5dca0 Move out src initialization for re-use / Adding antiperiodic BC 2025-08-06 16:51:14 +00:00
Chulwoo Jung
8419cc5c64 specflow evec I/O added, 2025-07-11 15:57:23 -04:00
Chulwoo Jung
2cc6deb8e0 Merge branch 'develop' of https://github.com/paboyle/Grid into ic2 2025-04-25 10:48:41 -04:00
Chulwoo Jung
19d0590579 Checking in for merging 2025-04-25 10:48:22 -04:00
Peter Boyle
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
Peter Boyle
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
Peter Boyle
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
Peter Boyle
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
Peter Boyle
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
Peter Boyle
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGaugeAction deriv
2025-04-24 14:39:10 -04:00
Peter Boyle
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
Peter Boyle
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
Peter Boyle
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
Peter Boyle
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
Peter Boyle
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
Peter Boyle
233150d93f Bug fix for no accelerator aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
Peter Boyle
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
Chulwoo Jung
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
Chulwoo Jung
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
Christoph Lehner
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
Christoph Lehner
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
Peter Boyle
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
Peter Boyle
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
Peter Boyle
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
Peter Boyle
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
Peter Boyle
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
Peter Boyle
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
Peter Boyle
938c47480f Updated compile on frontier.
Unsatisfactory hacks
2025-04-04 18:35:06 -04:00
Peter Boyle
3811d19298 Fence 2025-04-04 18:35:06 -04:00
Peter Boyle
83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
Peter Boyle
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
Peter Boyle
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
Peter Boyle
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
Peter Boyle
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
Peter Boyle
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
Peter Boyle
9eae8fca5d Size output 2025-04-04 18:35:05 -04:00
Peter Boyle
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
Mashy Green
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
Mashy Green
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
Peter Boyle
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
Christoph Lehner
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
Christoph Lehner
e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
Christoph Lehner
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora now 2025-03-11 13:50:43 +00:00
Mashy Green
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
Mashy Green
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
Mashy Green
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
Christoph Lehner
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
Peter Boyle
3d014864e2 Making LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
Peter Boyle
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
Peter Boyle
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
Peter Boyle
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
Peter Boyle
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
Peter Boyle
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
Peter Boyle
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
Peter Boyle
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
Peter Boyle
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
Peter Boyle
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
Muhammad Asif
b1ba209696 Latest upstream with np-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
Muhammad Asif
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
Mashy Green
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
Mashy Green
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
Mashy Green
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regresion test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after revering too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
Mashy Green
d2dd8f54e2 Fixing after revering too much! 2025-02-17 17:32:27 +00:00
Mashy Green
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce through Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issuing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
Mashy Green
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
Christoph Lehner
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
Mashy Green
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
Mashy Green
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
Chulwoo Jung
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
Chulwoo Jung
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
Chulwoo Jung
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
Chulwoo Jung
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
Mashy Green
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
Mashy Green
e99d42404e Removing the regresion test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
Mashy Green
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
Mashy Green
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
104 changed files with 4730 additions and 588 deletions

View File

@@ -73,6 +73,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/SimpleLanczos.h>
 #include <Grid/algorithms/iterative/PowerMethod.h>
 #include <Grid/algorithms/iterative/AdefGeneric.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>

View File

@@ -191,7 +191,7 @@ public:
 Lattice<sobj> pgbuf(&pencil_g);
 autoView(pgbuf_v , pgbuf, CpuWrite);
-std::cout << "CPU view" << std::endl;
+//std::cout << "CPU view" << std::endl;
 typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
 typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -215,7 +215,7 @@ public:
 else if ( sign == forward ) div = 1.0;
 else assert(0);
-std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
 FFTW_plan p;
 {
 FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,7 +229,7 @@ public:
 }
 // Barrel shift and collect global pencil
-std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
 Coordinate lcoor(Nd), gcoor(Nd);
 result = source;
 int pc = processor_coor[dim];
@@ -251,7 +251,7 @@ public:
 }
 }
-std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
+//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
 // Loop over orthog coords
 int NN=pencil_g.lSites();
 GridStopWatch timer;
@@ -274,7 +274,7 @@ public:
 usec += timer.useconds();
 flops+= flops_call*NN;
-std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
+//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
 // writing out result
 {
 autoView(pgbuf_v,pgbuf,CpuRead);
@@ -291,7 +291,7 @@ public:
 }
 result = result*div;
-std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
+//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
 // destroying plan
 FFTW<scalar>::fftw_destroy_plan(p);
 #endif

View File

@@ -277,6 +277,38 @@
 assert(0);
 }
 };
+template<class Matrix,class Field>
+class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+  RealD shift;
+public:
+  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    _Mat.Mdiag(in,out);
+    out = out + shift*in;
+  }
+  void OpDir (const Field &in, Field &out,int dir,int disp) {
+    _Mat.Mdir(in,out,dir,disp);
+  }
+  void OpDirAll (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
+  void Op (const Field &in, Field &out){
+    _Mat.M(in,out);
+    out = out + shift * in;
+  }
+  void AdjOp (const Field &in, Field &out){
+    _Mat.Mdag(in,out);
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    assert(0);
+  }
+  void HermOp(const Field &in, Field &out){
+    assert(0);
+  }
+};
 //////////////////////////////////////////////////////////
 // Even Odd Schur decomp operators; there are several
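
The class added above is a thin wrapper that shifts the spectrum of a non-Hermitian matrix, out = M in + shift * in, and leaves the Hermitian entry points unimplemented. A minimal usage sketch (the operator Ddwf, its grid FGrid and the shift value are illustrative assumptions, not part of the diff):

// Sketch only: Ddwf is assumed to be an already constructed Grid matrix
// (e.g. a DomainWallFermionD) acting on LatticeFermionD fields on FGrid.
RealD shift = 0.1;   // assumed illustrative value
ShiftedNonHermitianLinearOperator<DomainWallFermionD,LatticeFermionD> Mshift(Ddwf,shift);

LatticeFermionD src(FGrid), res(FGrid);
Mshift.Op(src,res);      // res = M src + shift * src
Mshift.AdjOp(src,res);   // res = Mdag src + shift * src (shift is a RealD, so no conjugate is needed)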

View File

@@ -208,8 +208,8 @@
 assert(Bkn.size()==batchCount);
 assert(Cmn.size()==batchCount);
-assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-assert(OpB!=GridBLAS_OP_T);
+//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+//assert(OpB!=GridBLAS_OP_T);
 int lda = m; // m x k column major
 int ldb = k; // k x n column major
@@ -367,28 +367,67 @@
 Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+else
+  eCmn = alpha * eAmk * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+else
+  eCmn = alpha * eAmk.adjoint() * eBkn ;
+});
+} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+else
+  eCmn = alpha * eAmk.transpose() * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+else
+  eCmn = alpha * eAmk * eBkn.adjoint() ;
+});
+} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 });
 } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+else
+  eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+} );
+} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+else
+  eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 } );
 } else {
 assert(0);
@@ -414,8 +453,8 @@
 RealD t2=usecond();
 int32_t batchCount = Amk.size();
-assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-assert(OpB!=GridBLAS_OP_T);
+//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+//assert(OpB!=GridBLAS_OP_T);
 int lda = m; // m x k column major
 int ldb = k; // k x n column major
@@ -514,28 +553,70 @@
 Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+else
+  eCmn = alpha * eAmk * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+else
+  eCmn = alpha * eAmk.adjoint() * eBkn ;
+});
+} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+else
+  eCmn = alpha * eAmk.transpose() * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+else
+  eCmn = alpha * eAmk * eBkn.adjoint() ;
+});
+} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+else
+  eCmn = alpha * eAmk * eBkn.transpose() ;
 });
 } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+else
+  eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+} );
+} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+else
+  eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 } );
 } else {
 assert(0);
@@ -661,29 +742,41 @@
 Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+else
+  eCmn = alpha * eAmk * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+else
+  eCmn = alpha * eAmk.transpose() * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+else
+  eCmn = alpha * eAmk * eBkn.transpose() ;
 });
 } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-} );
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+else
+  eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+});
 } else {
 assert(0);
 }
@@ -809,28 +902,40 @@
 Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+else
+  eCmn = alpha * eAmk * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+else
+  eCmn = alpha * eAmk.transpose() * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+else
+  eCmn = alpha * eAmk * eBkn.transpose() ;
 });
 } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+if (std::abs(beta) != 0.0)
+  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+else
+  eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 });
 } else {
 assert(0);
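
The recurring edit in this file guards the BLAS-style update C = alpha*op(A)*op(B) + beta*C so that C is never read when beta is zero; with an uninitialised output buffer the literal beta*C term could otherwise propagate NaNs. A standalone sketch of the same guard in plain Eigen (the function name and types are illustrative, not Grid API):

#include <Eigen/Dense>
#include <complex>

// Hypothetical helper showing the beta==0 guard used in the Eigen fallback paths above.
void gemm_guarded(std::complex<double> alpha, const Eigen::MatrixXcd &A,
                  const Eigen::MatrixXcd &B,  std::complex<double> beta,
                  Eigen::MatrixXcd &C)
{
  if (std::abs(beta) != 0.0)
    C = beta * C + alpha * A * B;   // general update; C must already hold valid data
  else
    C = alpha * A * B;              // pure product; C may be uninitialised
}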

View File

@@ -245,9 +245,10 @@ until convergence
 _HermOp(src_n,tmp);
 // std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
 // std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
-RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
 RealD vden = norm2(src_n);
-RealD na = vnum/vden;
+RealD na = std::sqrt(vnum/vden);
 if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 i=_MAX_ITER_IRL_MEVAPP_;
 evalMaxApprox = na;
@@ -255,6 +256,7 @@ until convergence
 src_n = tmp;
 }
 }
+std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
 std::vector<RealD> lme(Nm);
 std::vector<RealD> lme2(Nm);
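
For reference, the edit above replaces the Rayleigh-quotient estimate of the largest eigenvalue in the preliminary power iteration by a norm-ratio estimate built from HermOp^2; the two agree on an exact eigenvector, and the new form is what vnum = real(innerProduct(tmp,tmp)) together with na = std::sqrt(vnum/vden) computes:

% v_n is the current iterate and H v_n = tmp is the result of _HermOp
\lambda_{\max} \approx \frac{(v_n, H v_n)}{(v_n, v_n)}
\quad\longrightarrow\quad
\lambda_{\max} \approx \sqrt{\frac{(H v_n, H v_n)}{(v_n, v_n)}}
= \frac{\lVert H v_n \rVert}{\lVert v_n \rVert}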

View File

@@ -0,0 +1,931 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LANC_H
#define GRID_LANC_H
#include <string.h> //memset
#ifdef USE_LAPACK
#ifdef USE_MKL
#include<mkl_lapack.h>
#else
void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
//#include <lapacke/lapacke.h>
#endif
#endif
//#include <Grid/algorithms/densematrix/DenseMatrix.h>
// eliminate temorary vector in calc()
#define MEM_SAVE
namespace Grid
{
struct Bisection
{
#if 0
static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
std::vector < RealD > &BETA,
std::vector < RealD > &eig)
{
int i, j;
std::vector < RealD > evec1 (row_num + 3);
std::vector < RealD > evec2 (row_num + 3);
RealD eps2;
ALPHA[1] = 0.;
BETHA[1] = 0.;
for (i = 0; i < row_num - 1; i++)
{
ALPHA[i + 1] = A[i * (row_num + 1)].real ();
BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
}
ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
// Do we really need to sort here?
int begin = 1;
int end = row_num;
int swapped = 1;
while (swapped)
{
swapped = 0;
for (i = begin; i < end; i++)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
end--;
for (i = end - 1; i >= begin; i--)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
begin++;
}
for (i = 0; i < row_num; i++)
{
for (j = 0; j < row_num; j++)
{
if (i == j)
H[i * row_num + j] = evec2[i + 1];
else
H[i * row_num + j] = 0.;
}
}
}
#endif
static void bisec (std::vector < RealD > &c,
std::vector < RealD > &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh, std::vector < RealD > &x, RealD & eps2)
{
std::vector < RealD > wu (n + 2);
RealD h, q, x1, xu, x0, xmin, xmax;
int i, a, k;
b[1] = 0.0;
xmin = c[n] - fabs (b[n]);
xmax = c[n] + fabs (b[n]);
for (i = 1; i < n; i++)
{
h = fabs (b[i]) + fabs (b[i + 1]);
if (c[i] + h > xmax)
xmax = c[i] + h;
if (c[i] - h < xmin)
xmin = c[i] - h;
}
xmax *= 2.;
eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
if (eps1 <= 0.0)
eps1 = eps2;
eps2 = 0.5 * eps1 + 7.0 * (eps2);
x0 = xmax;
for (i = m1; i <= m2; i++)
{
x[i] = xmax;
wu[i] = xmin;
}
for (k = m2; k >= m1; k--)
{
xu = xmin;
i = k;
do
{
if (xu < wu[i])
{
xu = wu[i];
i = m1 - 1;
}
i--;
}
while (i >= m1);
if (x0 > x[k])
x0 = x[k];
while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
{
x1 = (xu + x0) / 2;
a = 0;
q = 1.0;
for (i = 1; i <= n; i++)
{
q =
c[i] - x1 -
((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
if (q < 0)
a++;
}
// printf("x1=%0.14e a=%d\n",x1,a);
if (a < k)
{
if (a < m1)
{
xu = x1;
wu[m1] = x1;
}
else
{
xu = x1;
wu[a + 1] = x1;
if (x[a] > x1)
x[a] = x1;
}
}
else
x0 = x1;
}
printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
x[k] = (x0 + xu) / 2;
}
}
};
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template < class Field > class SimpleLanczos
{
const RealD small = 1.0e-16;
public:
int lock;
int get;
int Niter;
int converged;
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
int Np; // Np -- Number of spare vecs in kryloc space
int Nm; // Nm -- total number of vectors
RealD OrthoTime;
RealD eresid;
// SortEigen < Field > _sort;
LinearFunction < Field > &_Linop;
// OperatorFunction < Field > &_poly;
/////////////////////////
// Constructor
/////////////////////////
void init (void)
{
};
// void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector < RealD > >&evecs);
SimpleLanczos (LinearFunction < Field > &Linop, // op
// OperatorFunction < Field > &poly, // polynmial
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _Niter): // Max iterations
_Linop (Linop),
// _poly (poly),
Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
{
Np = Nm - Nk;
assert (Np > 0);
};
/////////////////////////
// Sanity checked this routine (step) against Saad.
/////////////////////////
void RitzMatrix (std::vector < Field > &evec, int k)
{
if (1)
return;
GridBase *grid = evec[0].Grid();
Field w (grid);
std::cout << GridLogMessage << "RitzMatrix " << std::endl;
for (int i = 0; i < k; i++)
{
_Linop(evec[i], w);
// _poly(_Linop,evec[i],w);
std::cout << GridLogMessage << "[" << i << "] ";
for (int j = 0; j < k; j++)
{
ComplexD in = innerProduct (evec[j], w);
if (fabs ((double) i - j) > 1)
{
if (abs (in) > 1.0e-9)
{
std::cout << GridLogMessage << "oops" << std::endl;
abort ();
}
else
std::cout << GridLogMessage << " 0 ";
}
else
{
std::cout << GridLogMessage << " " << in << " ";
}
}
std::cout << GridLogMessage << std::endl;
}
}
void step (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
Field & last, Field & current, Field & next, uint64_t k)
{
if (lmd.size () <= k)
lmd.resize (k + Nm);
if (lme.size () <= k)
lme.resize (k + Nm);
// _poly(_Linop,current,next ); // 3. wk:=Avkβkv_{k1}
_Linop(current, next); // 3. wk:=Avkβkv_{k1}
if (k > 0)
{
next -= lme[k - 1] * last;
}
// std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
ComplexD zalph = innerProduct (current, next); // 4. αk:=(wk,vk)
RealD alph = real (zalph);
next = next - alph * current; // 5. wk:=wkαkvk
// std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
RealD beta = normalise (next); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
// norm=beta;
int interval = Nm / 100 + 1;
if ((k % interval) == 0)
std::
cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
beta << std::endl;
const RealD tiny = 1.0e-20;
if (beta < tiny)
{
std::cout << GridLogMessage << " beta is tiny " << beta << std::
endl;
}
lmd[k] = alph;
lme[k] = beta;
}
void qr_decomp (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int Nk,
int Nm,
std::vector < RealD > &Qt, RealD Dsh, int kmin, int kmax)
{
int k = kmin - 1;
RealD x;
RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
RealD c = (lmd[k] - Dsh) * Fden;
RealD s = -lme[k] * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
// Givens transformations
for (int k = kmin; k < kmax - 1; ++k)
{
RealD Fden = 1.0 / hypot (x, lme[k - 1]);
RealD c = lme[k - 1] * Fden;
RealD s = -x * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
lme[k - 1] = c * lme[k - 1] - s * x;
if (k != kmax - 2)
{
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
}
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
}
}
#if 0
#ifdef USE_LAPACK
#ifdef USE_MKL
#define LAPACK_INT MKL_INT
#else
#define LAPACK_INT long long
#endif
void diagonalize_lapack (std::vector < RealD > &lmd, std::vector < RealD > &lme, int N1, // all
int N2, // get
GridBase * grid)
{
const int size = Nm;
LAPACK_INT NN = N1;
double evals_tmp[NN];
double DD[NN];
double EE[NN];
for (int i = 0; i < NN; i++)
for (int j = i - 1; j <= i + 1; j++)
if (j < NN && j >= 0)
{
if (i == j)
DD[i] = lmd[i];
if (i == j)
evals_tmp[i] = lmd[i];
if (j == (i - 1))
EE[j] = lme[j];
}
LAPACK_INT evals_found;
LAPACK_INT lwork =
((18 * NN) >
(1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
LAPACK_INT liwork = 3 + NN * 10;
LAPACK_INT iwork[liwork];
double work[lwork];
LAPACK_INT isuppz[2 * NN];
char jobz = 'N'; // calculate evals only
char range = 'I'; // calculate il-th to iu-th evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
LAPACK_INT info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN / total) + 1;
double vl = 0.0, vu = 0.0;
LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
if (iu > NN)
iu = NN;
double tol = 0.0;
if (1)
{
memset (evals_tmp, 0, sizeof (double) * NN);
if (il <= NN)
{
printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
#ifdef USE_MKL
dstegr (&jobz, &range, &NN,
#else
LAPACK_dstegr (&jobz, &range, &NN,
#endif
(double *) DD, (double *) EE, &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double *) NULL, &NN,
isuppz, work, &lwork, iwork, &liwork, &info);
for (int i = iu - 1; i >= il - 1; i--)
{
printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
evals_tmp[i] = evals_tmp[i - (il - 1)];
if (il > 1)
evals_tmp[i - (il - 1)] = 0.;
}
}
{
grid->GlobalSumVector (evals_tmp, NN);
}
}
// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
}
#undef LAPACK_INT
#endif
void diagonalize (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int N2, int N1, GridBase * grid)
{
#ifdef USE_LAPACK
const int check_lapack = 0; // just use lapack if 0, check against lapack if 1
if (!check_lapack)
return diagonalize_lapack (lmd, lme, N2, N1, grid);
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
}
#endif
static RealD normalise (Field & v)
{
RealD nn = norm2 (v);
nn = sqrt (nn);
v = v * (1.0 / nn);
return nn;
}
void orthogonalize (Field & w, std::vector < Field > &evec, int k)
{
double t0 = -usecond () / 1e6;
typedef typename Field::scalar_type MyComplex;
MyComplex ip;
if (0)
{
for (int j = 0; j < k; ++j)
{
normalise (evec[j]);
for (int i = 0; i < j; i++)
{
ip = innerProduct (evec[i], evec[j]); // are the evecs normalised? ; this assumes so.
evec[j] = evec[j] - ip * evec[i];
}
}
}
for (int j = 0; j < k; ++j)
{
ip = innerProduct (evec[j], w); // are the evecs normalised? ; this assumes so.
w = w - ip * evec[j];
}
normalise (w);
t0 += usecond () / 1e6;
OrthoTime += t0;
}
void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
{
for (int i = 0; i < Qt.size (); ++i)
Qt[i] = 0.0;
for (int k = 0; k < Nm; ++k)
Qt[k + k * Nm] = 1.0;
}
void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
{
GridBase *grid = src.Grid();
// assert(grid == src._grid);
std::
cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
endl;
std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
std::cout << GridLogMessage << " -- size of eval = " << eval.
size () << std::endl;
// assert(c.size() && Nm == eval.size());
std::vector < RealD > lme (Nm);
std::vector < RealD > lmd (Nm);
Field current (grid);
Field last (grid);
Field next (grid);
Nconv = 0;
RealD beta_k;
// Set initial vector
// (uniform vector) Why not src??
// evec[0] = 1.0;
current = src;
std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
endl;
normalise (current);
std::
cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
std::endl;
// Initial Nk steps
OrthoTime = 0.;
double t0 = usecond () / 1e6;
RealD norm; // sqrt norm of last vector
uint64_t iter = 0;
bool initted = false;
std::vector < RealD > low (Nstop * 10);
std::vector < RealD > high (Nstop * 10);
RealD cont = 0.;
while (1) {
cont = 0.;
std::vector < RealD > lme2 (Nm);
std::vector < RealD > lmd2 (Nm);
for (uint64_t k = 0; k < Nm; ++k, iter++) {
step (lmd, lme, last, current, next, iter);
last = current;
current = next;
}
double t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
std::
cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
OrthoTime << "seconds" << std::endl;
// getting eigenvalues
lmd2.resize (iter + 2);
lme2.resize (iter + 2);
for (uint64_t k = 0; k < iter; ++k) {
lmd2[k + 1] = lmd[k];
lme2[k + 2] = lme[k];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: copy: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
{
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (Nstop / total) + 1;
int iu = (iter + 1) - (interval * node + 1);
int il = (iter + 1) - (interval * (node + 1));
std::vector < RealD > eval2 (iter + 3);
RealD eps2;
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
// diagonalize(eval2,lme2,iter,Nk,grid);
RealD diff = 0.;
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
fabs (high[iu-i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
high[iu-i], diff);
high[iu-i] = eval2[i];
}
il = (interval * node + 1);
iu = (interval * (node + 1));
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
fabs (low[i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
low[i], diff);
low[i] = eval2[i];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
}
for (uint64_t k = 0; k < Nk; ++k) {
// eval[k] = eval2[k];
}
if (initted)
{
grid->GlobalSumVector (&cont, 1);
if (cont < 1.) return;
}
initted = true;
}
}
#if 0
/**
There is some matrix Q such that for any vector y
Q.e_1 = y and Q is unitary.
**/
template < class T >
static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
{
int N = y.size (); //Matrix Size
Fill (Q, 0.0);
T tau;
for (int i = 0; i < N; i++)
{
Q[i][0] = y[i];
}
T sig = conj (y[0]) * y[0];
T tau0 = fabs (sqrt (sig));
for (int j = 1; j < N; j++)
{
sig += conj (y[j]) * y[j];
tau = abs (sqrt (sig));
if (abs (tau0) > 0.0)
{
T gam = conj ((y[j] / tau) / tau0);
for (int k = 0; k <= j - 1; k++)
{
Q[k][j] = -gam * y[k];
}
Q[j][j] = tau0 / tau;
}
else
{
Q[j - 1][j] = 1.0;
}
tau0 = tau;
}
return tau;
}
/**
There is some matrix Q such that for any vector y
Q.e_k = y and Q is unitary.
**/
template < class T >
static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
{
T tau = orthQ (Q, y);
SL (Q);
return tau;
}
/**
Wind up with a matrix with the first con rows untouched
say con = 2
Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
and the matrix is upper hessenberg
and with f and Q appropriately modidied with Q is the arnoldi factorization
**/
template < class T > static void Lock (DenseMatrix < T > &H, ///Hess mtx
DenseMatrix < T > &Q, ///Lock Transform
T val, ///value to be locked
int con, ///number already locked
RealD small, int dfg, bool herm)
{
//ForceTridiagonal(H);
int M = H.dim;
DenseVector < T > vec;
Resize (vec, M - con);
DenseMatrix < T > AH;
Resize (AH, M - con, M - con);
AH = GetSubMtx (H, con, M, con, M);
DenseMatrix < T > QQ;
Resize (QQ, M - con, M - con);
Unity (Q);
Unity (QQ);
DenseVector < T > evals;
Resize (evals, M - con);
DenseMatrix < T > evecs;
Resize (evecs, M - con, M - con);
Wilkinson < T > (AH, evals, evecs, small);
int k = 0;
RealD cold = abs (val - evals[k]);
for (int i = 1; i < M - con; i++)
{
RealD cnew = abs (val - evals[i]);
if (cnew < cold)
{
k = i;
cold = cnew;
}
}
vec = evecs[k];
ComplexD tau;
orthQ (QQ, vec);
//orthQM(QQ,AH,vec);
AH = Hermitian (QQ) * AH;
AH = AH * QQ;
for (int i = con; i < M; i++)
{
for (int j = con; j < M; j++)
{
Q[i][j] = QQ[i - con][j - con];
H[i][j] = AH[i - con][j - con];
}
}
for (int j = M - 1; j > con + 2; j--)
{
DenseMatrix < T > U;
Resize (U, j - 1 - con, j - 1 - con);
DenseVector < T > z;
Resize (z, j - 1 - con);
T nm = norm (z);
for (int k = con + 0; k < j - 1; k++)
{
z[k - con] = conj (H (j, k + 1));
}
normalise (z);
RealD tmp = 0;
for (int i = 0; i < z.size () - 1; i++)
{
tmp = tmp + abs (z[i]);
}
if (tmp < small / ((RealD) z.size () - 1.0))
{
continue;
}
tau = orthU (U, z);
DenseMatrix < T > Hb;
Resize (Hb, j - 1 - con, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += H[a][con + 1 + c] * U[c][b];
} //sum += H(a,con+1+c)*U(c,b);}
Hb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
H[l][k] = Hb[k - 1 - con][l];
}
} //H(Hb[k-1-con][l] , l,k);}}
DenseMatrix < T > Qb;
Resize (Qb, M, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += Q[a][con + 1 + c] * U[c][b];
} //sum += Q(a,con+1+c)*U(c,b);}
Qb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
Q[l][k] = Qb[k - 1 - con][l];
}
} //Q(Qb[k-1-con][l] , l,k);}}
DenseMatrix < T > Hc;
Resize (Hc, M, M);
for (int a = 0; a < j - 1 - con; a++)
{
for (int b = 0; b < M; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += conj (U[c][a]) * H[con + 1 + c][b];
} //sum += conj( U(c,a) )*H(con+1+c,b);}
Hc[b][a] = sum;
}
}
for (int k = 0; k < M; k++)
{
for (int l = con + 1; l < j; l++)
{
H[l][k] = Hc[k][l - 1 - con];
}
} //H(Hc[k][l-1-con] , l,k);}}
}
}
#endif
};
}
#endif
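
The new header provides a lightweight Lanczos eigenvalue estimator driven by the tridiagonal bisection routine above. A minimal sketch of driving it (the operator HermOp, the source src and the parameter values are assumptions for illustration, not taken from the file):

// Sketch only: HermOp is assumed to be a LinearFunction<LatticeFermionD>
// wrapping a Hermitian Grid operator, and src a field on the same grid.
const int  Nstop = 10, Nk = 20, Nm = 40;   // Nm > Nk is required (Np = Nm - Nk > 0)
const RealD resid = 1.0e-8;
const int  MaxIter = 10000;

SimpleLanczos<LatticeFermionD> SL(HermOp, Nstop, Nk, Nm, resid, MaxIter);

std::vector<RealD> evals(Nm);
int Nconv;
SL.calc(evals, src, Nconv);   // repeats Nm Lanczos steps plus bisection sweeps until the
                              // extremal eigenvalue windows change by less than resid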

View File

@@ -97,7 +97,7 @@ public:
 RealD scale;
-ConjugateGradient<FineField> CG(1.0e-2,100,false);
+ConjugateGradient<FineField> CG(1.0e-3,400,false);
 FineField noise(FineGrid);
 FineField Mn(FineGrid);
@@ -110,7 +110,7 @@ public:
 hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-for(int i=0;i<1;i++){
+for(int i=0;i<4;i++){
 CG(hermop,noise,subspace[b]);
@@ -146,7 +146,7 @@ public:
 DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
-for(int i=0;i<3;i++){
+for(int i=0;i<2;i++){
 // void operator() (const Field &src, Field &psi){
 #if 1
 std::cout << GridLogMessage << " inverting on noise "<<std::endl;

View File

@@ -441,8 +441,20 @@ public:
 std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
 }
 #else
+//////////////////////////////////////////////////////////////////////
+// Galerkin projection of matrix
+//////////////////////////////////////////////////////////////////////
 void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
 Aggregation<Fobj,CComplex,nbasis> & Subspace)
+{
+  CoarsenOperator(linop,Subspace,Subspace);
+}
+//////////////////////////////////////////////////////////////////////
+// Petrov - Galerkin projection of matrix
+//////////////////////////////////////////////////////////////////////
+void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+Aggregation<Fobj,CComplex,nbasis> & U,
+Aggregation<Fobj,CComplex,nbasis> & V)
 {
 std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
 GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
 // Orthogonalise the subblocks over the basis
 /////////////////////////////////////////////////////////////
 CoarseScalar InnerProd(CoarseGrid());
-blockOrthogonalise(InnerProd,Subspace.subspace);
+blockOrthogonalise(InnerProd,V.subspace);
+blockOrthogonalise(InnerProd,U.subspace);
-// for(int s=0;s<Subspace.subspace.size();s++){
-//   std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
-// }
 const int npoint = geom.npoint;
 Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
 std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
 for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 tphaseBZ-=usecond();
-phaV = phaF[p]*Subspace.subspace[i];
+phaV = phaF[p]*V.subspace[i];
 tphaseBZ+=usecond();
 /////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
 // std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
 tproj-=usecond();
-blockProject(coarseInner,MphaV,Subspace.subspace);
+blockProject(coarseInner,MphaV,U.subspace);
 coarseInner = conjugate(pha[p]) * coarseInner;
 ComputeProj[p] = coarseInner;
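
The two overloads above separate the standard Galerkin coarsening, which uses the same aggregation space on both sides, from a Petrov-Galerkin coarsening with distinct left (U) and right (V) subspaces; in operator form,

% Galerkin (U = V = Subspace) versus Petrov--Galerkin coarse operator
A_c = U^\dagger A\, U
\qquad\longrightarrow\qquad
A_c = U^\dagger A\, V

which is what applying blockProject(coarseInner,MphaV,U.subspace) to M acting on phaF[p]*V.subspace[i] implements, momentum point by momentum point.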

View File

@@ -69,7 +69,7 @@ public:
 }
 // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-void construct(pointer __p, const _Tp& __val) { assert(0);};
+void construct(pointer __p, const _Tp& __val) { };
 void construct(pointer __p) { };
 void destroy(pointer __p) { };
 };
@@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
 template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
-template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
+template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
 template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
 template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
+/*
 template<class T> class vecView
 {
 protected:
@@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
 #define autoVecView(v_v,v,mode) \
 auto v_v = VectorView(v,mode); \
 ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
+*/
 NAMESPACE_END(Grid);
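
With vecView commented out, the intended pattern is to keep a hostVector for CPU-side setup and a deviceVector for kernels, and to move data explicitly. A short sketch of that usage (an assumed pattern, not taken from the diff; acceleratorCopyToDevice(from,to,bytes) and acceleratorCopyFromDevice(from,to,bytes) are the Grid copy helpers visible in the MemoryManager changes below):

const size_t N = 1024;                      // illustrative size
hostVector<double>   host_buf(N);           // aligned host-side storage
deviceVector<double> dev_buf(N);            // device-resident storage

for (size_t i = 0; i < N; i++) host_buf[i] = 0.0;   // fill on the host

acceleratorCopyToDevice(host_buf.data(), dev_buf.data(), N * sizeof(double));
// ... run device kernels reading and writing dev_buf.data() ...
acceleratorCopyFromDevice(dev_buf.data(), host_buf.data(), N * sizeof(double));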

View File

@@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ];
 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
 #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
 //#define dprintf(...)
+//#define mprintf(...)
 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 ///////////////////////////////////////////////////////////
 assert(AccCache.state!=Empty);
-dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
+dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
 assert(AccCache.accLock==0);
 assert(AccCache.cpuLock==0);
 assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 DeviceBytes -=AccCache.bytes;
 LRUremove(AccCache);
 AccCache.AccPtr=(uint64_t) NULL;
-dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
+dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
 }
 uint64_t CpuPtr = AccCache.CpuPtr;
 EntryErase(CpuPtr);
@@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 ///////////////////////////////////////////////////////////////////////////
 assert(AccCache.state!=Empty);
-mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
 (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
 if (AccCache.accLock!=0) return;
@@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 AccCache.AccPtr=(uint64_t)NULL;
 AccCache.state=CpuDirty; // CPU primary now
 DeviceBytes -=AccCache.bytes;
-dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
+dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
 }
 // uint64_t CpuPtr = AccCache.CpuPtr;
 DeviceEvictions++;
@@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 assert(AccCache.AccPtr!=(uint64_t)NULL);
 assert(AccCache.CpuPtr!=(uint64_t)NULL);
 acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
 DeviceToHostBytes+=AccCache.bytes;
 DeviceToHostXfer++;
 AccCache.state=Consistent;
@@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
 AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
 DeviceBytes+=AccCache.bytes;
 }
-mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
+mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
 (uint64_t)AccCache.bytes,
 (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
 acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
 if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
+dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
 AcceleratorViewClose((uint64_t)Ptr);
 } else if( (mode==CpuRead)||(mode==CpuWrite)){
 CpuViewClose((uint64_t)Ptr);
@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
 uint64_t CpuPtr = (uint64_t)_CpuPtr;
 if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
+dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
} }
void MemoryManager::EvictVictims(uint64_t bytes) void MemoryManager::EvictVictims(uint64_t bytes)
{ {
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes); assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){ while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){ if ( DeviceLRUBytes > 0){
@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n", dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr, (uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr, (uint64_t)CpuPtr,
(uint64_t)AccCache.bytes, (uint64_t)AccCache.bytes,
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent AccCache.state = Consistent; // Empty + AccRead => Consistent
} }
AccCache.accLock= 1; AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){ } else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) { if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache); CpuDiscard(AccCache);
@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
} }
AccCache.accLock++; AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) { } else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else else
AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++; AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) { } else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++; AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else { } else {
assert(0); assert(0);
} }
@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// If view is opened on device must remove from LRU // If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){ if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU // must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n"); dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache); LRUremove(AccCache);
} }
@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
AccCache.accLock--; AccCache.accLock--;
// Move to LRU queue if not locked and close on device // Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) { if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache); LRUinsert(AccCache);
} else { } else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
} }
} }
void MemoryManager::CpuViewClose(uint64_t CpuPtr) void MemoryManager::CpuViewClose(uint64_t CpuPtr)
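The log-format changes above trace the memory manager's view cache through its four states (Empty, CpuDirty, Consistent, AccDirty). As a reading aid only, here is a minimal standalone model of the accelerator-open transitions those dprintf lines report; the enum and function names are illustrative, not Grid's, and the write cases for Empty and CpuDirty are assumed from the surrounding comments rather than quoted from the code.

#include <cassert>

// Illustrative model only (not Grid code) of the state changes logged above.
enum class ViewState { Empty, CpuDirty, Consistent, AccDirty };
enum class AccMode   { Read, Write, WriteDiscard };

ViewState AcceleratorOpenTransition(ViewState s, AccMode m)
{
  const bool writing = (m != AccMode::Read);
  switch (s) {
  case ViewState::Empty:      // data copied (or discarded) onto the device first
  case ViewState::CpuDirty:   // "CpuDirty + AccRead => Consistent"
  case ViewState::Consistent: // "Consistent + AcceleratorWrite => AccDirty"
    return writing ? ViewState::AccDirty : ViewState::Consistent;
  case ViewState::AccDirty:   // device copy stays primary whatever the mode
    return ViewState::AccDirty;
  }
  assert(0 && "unreachable");
  return s;
}

int main()
{
  assert(AcceleratorOpenTransition(ViewState::CpuDirty,   AccMode::Read)  == ViewState::Consistent);
  assert(AcceleratorOpenTransition(ViewState::Consistent, AccMode::Write) == ViewState::AccDirty);
  return 0;
}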


@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ; extern bool Stencil_force_mpi ;
@@ -147,7 +149,8 @@ public:
sizeof(obj),d*100+p); sizeof(obj),d*100+p);
} }
CommsComplete(list); if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){ for(int p=1;p<_processors[d];p++){
accum = accum + column[p]; accum = accum + column[p];
} }
@@ -192,6 +195,11 @@ public:
void *recv, void *recv,
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir); int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,


@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
//////////////////////////////////////////// ////////////////////////////////////////////
@@ -362,8 +363,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes) int bytes)
{ {
std::vector<MpiCommsRequest_t> reqs(0); std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
@@ -379,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
assert(ierr==0); assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
} }
// Basic Halo comms primitive // Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@@ -399,6 +395,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
#ifdef ACCELERATOR_AWARE_MPI #ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest,int dox, int dest,int dox,
@@ -440,8 +438,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=rbytes; off_node_bytes+=rbytes;
} }
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
} }
// This is a NVLINK PUT
if (dox) { if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32; tag= dir+_processor*32;
@@ -450,9 +455,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=xbytes; off_node_bytes+=xbytes;
} else { } else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv); void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
} }
} }
return off_node_bytes; return off_node_bytes;
@@ -461,7 +468,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
int nreq=list.size(); int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise(); acceleratorCopySynchronise();
if (nreq==0) return; if (nreq==0) return;
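The NVLINK_GET switch above changes the intranode path from a sender-side PUT (push my send buffer into the neighbour's receive window) to a receiver-side GET (pull from the neighbour's send window), which is why the receive branch gains a ShmBufferTranslate/copy and the send branch loses one. A hedged sketch of the two directions follows, using only the calls that appear in this diff; the wrapper function itself and its argument list are illustrative, not Grid code.

#include <Grid/Grid.h>
#include <cassert>
using namespace Grid;

// Sketch only: contrast of the PUT and GET intranode transfers selected by the
// NVLINK_GET guards above. Error handling and request bookkeeping are omitted.
void IntranodeTransferSketch(CartesianCommunicator &comm, bool use_get,
                             void *xmit, void *recv,
                             int dest, int from,
                             size_t xbytes, size_t rbytes)
{
  if (use_get) {
    // GET: the receiver maps the neighbour's send buffer and pulls from it.
    void *peer_send = comm.ShmBufferTranslate(from, xmit);
    assert(peer_send != NULL);
    acceleratorCopyDeviceToDeviceAsynch(peer_send, recv, rbytes);
  } else {
    // PUT: the sender maps the neighbour's receive buffer and pushes into it.
    // A node-local StencilBarrier is then needed later so the receiver does
    // not read before every neighbour's push has landed.
    void *peer_recv = comm.ShmBufferTranslate(dest, recv);
    assert(peer_recv != NULL);
    acceleratorCopyDeviceToDeviceAsynch(xmit, peer_recv, xbytes);
  }
  acceleratorCopySynchronise(); // fences the asynchronous device copies
}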
@@ -561,53 +568,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
if (dox) { if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32; tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes); host_xmit = this->HostBufferMalloc(xbytes);
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); // ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0); // assert(ierr==0);
// off_node_bytes+=xbytes; // off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit; srq.PacketType = InterNodeXmit;
srq.bytes = xbytes; srq.bytes = xbytes;
// srq.req = xrq; // srq.req = xrq;
srq.host_buf = host_xmit; srq.host_buf = host_xmit;
srq.device_buf = xmit; srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq); list.push_back(srq);
#else
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
const int chunks=1;
for(int n=0;n<chunks;n++){
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#endif
} }
} }
return off_node_bytes; return off_node_bytes;
} }
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
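The two polling routines above exist so that, without accelerator-aware MPI, the device-to-host staging of each face can overlap with posting the matching MPI_Isend, and each completed MPI_Irecv can be followed immediately by its host-to-device copy. A hypothetical driver sequence is sketched below; the call order is inferred from the declarations and comments in this compare view rather than taken from Grid's stencil code, and the buffer and rank arrays are placeholders.

#include <Grid/Grid.h>
using namespace Grid;

// Hypothetical call sequence only; xmit[], recv[], dest[], from[], xbytes[],
// rbytes[] stand in for the stencil's face tables.
void HaloExchangeSketch(CartesianCommunicator &comm, int dirs,
                        void **xmit, void **recv, int *dest, int *from,
                        int *xbytes, int *rbytes)
{
  std::vector<CommsRequest_t> reqs;
  for (int dir = 0; dir < dirs; dir++)
    comm.StencilSendToRecvFromPrepare(reqs, xmit[dir], dest[dir], 1,
                                      recv[dir], from[dir], 1,
                                      xbytes[dir], rbytes[dir], dir); // DtoH staging (and Irecv posting)
  comm.StencilSendToRecvFromPollDtoH(reqs);    // each finished DtoH -> MPI_Isend
  for (int dir = 0; dir < dirs; dir++)
    comm.StencilSendToRecvFromBegin(reqs, xmit[dir], dest[dir], 1,
                                    recv[dir], from[dir], 1,
                                    xbytes[dir], rbytes[dir], dir);   // intranode device-to-device copies
  comm.StencilSendToRecvFromPollIRecv(reqs);   // each landed Irecv -> HtoD copy
  comm.StencilSendToRecvFromComplete(reqs, 0); // wait on sends, fence copies, free host buffers
}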
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
@@ -644,69 +703,89 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
* - complete all copies * - complete all copies
* - post MPI send asynch * - post MPI send asynch
*/ */
#ifdef NVLINK_GET
if ( dor ) {
// static int printed; if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// if((printed<8) && this->IsBoss() ) { // Intranode
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes); void *shm = (void *) this->ShmBufferTranslate(from,xmit);
// printed++; assert(shm!=NULL);
// }
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) { if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
#ifdef DEVICE_TO_HOST_CONCURRENT // Intranode
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
for(int idx = 0; idx<list.size();idx++){
if ( (list[idx].device_buf==xmit)
&&(list[idx].PacketType==InterNodeXmit)
&&(list[idx].bytes==xbytes) ) {
list_idx = idx;
host_xmit = list[idx].host_buf;
}
}
assert(list_idx != -1); // found it
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
#endif
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv); void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
} }
} }
#endif
return off_node_bytes; return off_node_bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
int nreq=list.size(); acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
if (nreq==0) return; std::vector<MPI_Status> status;
std::vector<MPI_Status> status(nreq); std::vector<MPI_Request> MpiRequests;
std::vector<MPI_Request> MpiRequests(nreq);
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
for(int r=0;r<nreq;r++){ int nreq=MpiRequests.size();
MpiRequests[r] = list[r].req;
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
} }
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // for(int r=0;r<nreq;r++){
assert(ierr==0); // if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
for(int r=0;r<nreq;r++){ // }
if ( list[r].PacketType==InterNodeRecv ) { // }
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
}
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs this->HostBufferFreeAll(); // Clean up the buffer allocs
this->StencilBarrier(); #ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
} }
#endif #endif
//////////////////////////////////////////// ////////////////////////////////////////////


@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{ {
assert(0); assert(0);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);} void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
@@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{ {
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,
@@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int recv_from_rank,int dor, int recv_from_rank,int dor,
int xbytes,int rbytes, int dir) int xbytes,int rbytes, int dir)
{ {
return xbytes+rbytes; return 0.0;
} }
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,


@@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI #ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv }; /*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct { typedef struct {
PacketType_t PacketType; PacketType_t PacketType;
void *host_buf; void *host_buf;
void *device_buf; void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes; unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req; MpiCommsRequest_t req;
} CommsRequest_t; } CommsRequest_t;
#endif #endif
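The widened CommsRequest_t above carries everything a packet needs in order to be resumed at a later stage (staging buffers, the copy event, tag, destination rank, communicator direction), and the extra PacketType_t values record where in its life cycle each packet currently is. A small hypothetical helper, not part of Grid, summarising the transitions driven by the polling routines earlier in this compare view; the FaceGather step is inferred from the enum name only.

// Hypothetical helper (not in Grid) documenting the packet life cycle:
//   send:      (FaceGather) -> InterNodeXmit -> InterNodeXmitISend
//   receive:   InterNodeRecv -> InterNodeReceiveHtoD
//   intranode: IntraNodeXmit / IntraNodeRecv complete via device copies alone.
PacketType_t AdvancePacket(PacketType_t p)
{
  switch (p) {
  case InterNodeXmit: return InterNodeXmitISend;   // DtoH copy done, MPI_Isend posted
  case InterNodeRecv: return InterNodeReceiveHtoD; // MPI_Test passed, HtoD copy queued
  default:            return p;                    // other states advance elsewhere
  }
}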
@@ -119,7 +137,7 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes); // static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };


@@ -542,12 +542,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI #ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n"); // printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0 #if 0
HostCommBuf= acceleratorAllocHost(bytes); HostCommBuf= acceleratorAllocHost(bytes);
#else #else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H #if 0
#warning "Moving host buffers to specific NUMA domain" #warning "Moving host buffers to specific NUMA domain"
int numa; int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA"); char *numa_name=(char *)getenv("MPI_BUF_NUMA");
@@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) //void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{ //{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) //#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes); // acceleratorCopyToDevice(src,dest,bytes);
#else //#else
bcopy(src,dest,bytes); // bcopy(src,dest,bytes);
#endif //#endif
} //}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
@@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@@ -989,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
} }
#endif #endif
//SharedMemoryTest(); SharedMemoryTest();
} }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// On node barrier // On node barrier
@@ -1011,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode; check[0]=GlobalSharedMemory::WorldNode;
check[1]=r; check[1]=r;
check[2]=magic; check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t)); acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier(); acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode); assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r); assert(check[1]==r);
assert(check[2]==magic); assert(check[2]==magic);
ShmBarrier();
} }
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
} }
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)


@@ -68,7 +68,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret; return ret;
} }
#if 1
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{ {
int sshift[2]; int sshift[2];
@@ -125,7 +125,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size); static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size); static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0; RealD tcopy=0.0;
@@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
// int rank = grid->_processor; // int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
grid->Barrier(); grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes; xbytes+=bytes;
grid->Barrier(); grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
@@ -226,12 +243,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd); static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi; scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi; scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){ for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size); send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size); recv_buf_extract[s].resize(buffer_size);
} }
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); // ExtractPointerArray<scalar_object> pointers(Nsimd); //
@@ -283,11 +304,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi, grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)recv_buf_extract_mpi, (void *)recv_buf_extract_mpi,
recv_from_rank, recv_from_rank,
bytes); bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
xbytes+=bytes; xbytes+=bytes;
grid->Barrier(); grid->Barrier();
@@ -311,234 +343,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
} }
} }
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
}
}
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
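When MPI is not accelerator aware, the Cshift paths above bounce every face through host memory ("bouncy bouncy"): copy device to host, exchange host pointers with MPI, copy host to device. A generic sketch of that pattern is given below; it mixes Grid's accelerator copy helpers with a raw MPI_Sendrecv purely for illustration, whereas the code above routes the exchange through grid->SendToRecvFrom.

#include <mpi.h>
#include <vector>
#include <Grid/Grid.h>   // for acceleratorCopyFromDevice / acceleratorCopyToDevice
using namespace Grid;

// "Bouncy bouncy" staging sketch: MPI only ever sees host pointers.
void staged_sendrecv(void *d_send, void *d_recv, size_t bytes,
                     int to, int from, MPI_Comm comm)
{
  std::vector<char> h_send(bytes), h_recv(bytes);
  acceleratorCopyFromDevice(d_send, h_send.data(), bytes);              // D -> H
  MPI_Sendrecv(h_send.data(), (int)bytes, MPI_CHAR, to,   0,
               h_recv.data(), (int)bytes, MPI_CHAR, from, 0,
               comm, MPI_STATUS_IGNORE);
  acceleratorCopyToDevice(h_recv.data(), d_recv, bytes);                // H -> D
}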


@@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int))); d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device //copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream); acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy //sync after copy
accelerator_barrier(); accelerator_barrier();


@@ -466,6 +466,12 @@ public:
static deviceVector<vobj> recv_buf; static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth); send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<MpiCommsRequest_t> fwd_req; std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req; std::vector<MpiCommsRequest_t> bwd_req;
@@ -495,9 +501,16 @@ public:
t_gather+=usecond()-t; t_gather+=usecond()-t;
t=usecond(); t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req, grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank, (void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
t_comms+=usecond()-t; t_comms+=usecond()-t;
} }
for ( int d=0;d < depth ; d ++ ) { for ( int d=0;d < depth ; d ++ ) {
@@ -508,9 +521,16 @@ public:
t_gather+= usecond() - t; t_gather+= usecond() - t;
t=usecond(); t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req, grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
t_comms+=usecond()-t; t_comms+=usecond()-t;
} }
@@ -533,8 +553,13 @@ public:
t=usecond(); t=usecond();
grid->CommsComplete(fwd_req); grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t; t_comms+= usecond() - t;
t=usecond(); t=usecond();
for ( int d=0;d < depth ; d ++ ) { for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++; ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@@ -543,6 +568,11 @@ public:
t=usecond(); t=usecond();
grid->CommsComplete(bwd_req); grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t; t_comms+= usecond() - t;
t=usecond(); t=usecond();


@@ -0,0 +1,196 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Copyright (C) 2020 - 2025
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion5D<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
int n_rhs;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);
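For orientation, a hedged usage sketch of the new 5d compact clover operator declared above. The grid construction follows Grid's usual 4d/5d pattern; Ls, the mass and clover coefficients are placeholders, and CompactWilsonCloverFermion5DD is the double-precision typedef added later in this compare view.

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  const int Ls = 8; // placeholder fifth-dimension extent

  GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  LatticeGaugeFieldD Umu(UGrid);
  std::vector<int> seeds({1, 2, 3, 4});
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds);
  SU<Nc>::HotConfiguration(RNG4, Umu); // placeholder gauge field

  RealD mass = 0.1, csw_r = 1.0, csw_t = 1.0, cF = 1.0; // placeholder parameters
  WilsonImplParams params;
  // params.boundary_phases[Nd-1] = 0.0; would select the fixedBoundaries branch above

  CompactWilsonCloverFermion5DD Dcc(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                                    mass, csw_r, csw_t, cF, params);

  LatticeFermionD src(FGrid), res(FGrid);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds);
  random(RNG5, src);
  Dcc.M(src, res); // apply the full operator once

  Grid_finalize();
}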


@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM); NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover); NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D); NAMESPACE_CHECK(Wilson5D);
@@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
// Compact Clover fermions // Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>; template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>; template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2; typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF; typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD; typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2; typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF; typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD; typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;


@@ -484,6 +484,11 @@ public:
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier(); accelerator_barrier();
#ifdef NVLINK_GET
this->_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single node; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
#endif
} }
}; };


@@ -91,13 +91,13 @@ public:
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);}; virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now // half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac


@@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const ImplParams& impl_p)
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
, Tmp(&FiveDimGrid)
, BoundaryMask(&FiveDimGrid)
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
//if (clover_anisotropy.isAnisotropic)
// csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
} else {
MooeeInternal(in, out, DiagonalEven, TriangleEven);
}
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
} else {
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
}
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
// TODO: For now this inversion is explictly done on the CPU
double t7 = usecond();
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);
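Illustrative note, not part of the commits above: in conventional notation, and up to code-specific normalisation, the object ImportGauge assembles before inversion is

  C(x) = (4 + M_5)\,\mathbb{1} + \frac{c_{sw}}{2}\sum_{\mu<\nu}\sigma_{\mu\nu}F_{\mu\nu}(x),
  \qquad \sigma_{\mu\nu} = \tfrac{i}{2}[\gamma_\mu,\gamma_\nu],

with c_{sw} -> csw_r on spatial planes and csw_t on temporal planes. The factor 1/2 is the csw_r *= 0.5; csw_t *= 0.5; in the constructor, and (4 + M5) is the diagonal term handed to CloverHelpers::InstantiateClover in place of the usual 4 + m of the 4d operator.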


@@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
+ Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -484,6 +485,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
Dhop(in,out,dag); // -0.5 is included
axpy(out,4.0-M5,in,out);
}
template <class Impl>
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(4.0 + M5);
out = scal * in;
}
template <class Impl>
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
out = (1.0/(4.0 + M5))*in;
}
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in,out);
}
template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)
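Illustrative sketch, not part of the commits above: the new 5D Mooee is just multiplication by the scalar (4 + M5) and MooeeInv by its reciprocal, so a quick sanity check is that the pair composes to the identity. Dw and src are assumed to exist in a test program:

  LatticeFermion tmp(src.Grid());
  LatticeFermion chk(src.Grid());
  Dw.Mooee(src, tmp);        // tmp = (4 + M5) * src
  Dw.MooeeInv(tmp, chk);     // chk = tmp / (4 + M5)
  chk = chk - src;
  std::cout << GridLogMessage << "|MooeeInv Mooee src - src|^2 = " << norm2(chk) << std::endl;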


@@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
@@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
- // dependent on result of merge
+ // // dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}


@@ -0,0 +1,45 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);


@@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master


@@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master


@@ -62,7 +62,7 @@ do
done
done
- CC_LIST="CompactWilsonCloverFermionInstantiation"
+ CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do


@@ -76,27 +76,27 @@ public:
return action;
};
- virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
+ virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
//extend Ta to include Lorentz indexes
RealD factor_p = c_plaq/RealD(Nc)*0.5;
RealD factor_r = c_rect/RealD(Nc)*0.5;
- GridBase *grid = Umu.Grid();
+ GridBase *grid = U.Grid();
- std::vector<GaugeLinkField> U (Nd,grid);
+ std::vector<GaugeLinkField> Umu (Nd,grid);
for(int mu=0;mu<Nd;mu++){
- U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+ Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
}
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
- WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
+ WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);
GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);
for (int mu=0; mu < Nd; mu++){
- dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
+ dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
- dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
+ dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}


@@ -73,20 +73,23 @@ public:
// extend Ta to include Lorentz indexes
RealD factor = 0.5 * beta / RealD(Nc);
- GaugeLinkField Umu(U.Grid());
- GaugeLinkField dSdU_mu(U.Grid());
+ GridBase *grid = U.Grid();
+ GaugeLinkField dSdU_mu(grid);
+ std::vector<GaugeLinkField> Umu(Nd, grid);
for (int mu = 0; mu < Nd; mu++) {
- Umu = PeekIndex<LorentzIndex>(U, mu);
+ Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
+ }
+ for (int mu = 0; mu < Nd; mu++) {
// Staple in direction mu
- WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
+ WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
- dSdU_mu = Ta(Umu * dSdU_mu) * factor;
+ dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
}
private:
RealD beta;
};
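Illustrative note, not from the commit: the restructured loop computes the same force. Per direction it is, schematically,

  \frac{\partial S}{\partial U_\mu(x)} \propto \frac{\beta}{2N_c}\,\mathrm{Ta}\!\left(U_\mu(x)\,\Sigma_\mu(x)\right),

where \Sigma_\mu is the sum of nu != mu staples from WilsonLoops<Gimpl>::Staple and Ta is the traceless anti-hermitian projection. The behavioural point of the change is only that all four links are peeked once, up front, so the std::vector<GaugeLinkField> overload of Staple can be used instead of re-extracting links from the Lorentz field on every call.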


@@ -111,8 +111,8 @@ public:
};
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
- std::string config, rng;
+ std::string config, rng, smr;
- this->build_filenames(traj, Params, config, rng);
+ this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);


@@ -75,7 +75,7 @@ public:
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng, smr;
- this->build_filenames(traj, Params, config, rng);
+ this->build_filenames(traj, Params, config, smr, rng);
GridBase *grid = SmartConfig.get_U(false).Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
@@ -102,7 +102,7 @@ public:
if ( Params.saveSmeared ) {
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(smr);
- _IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
+ _IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
@@ -118,8 +118,8 @@ public:
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
- std::string config, rng;
+ std::string config, rng, smr;
- this->build_filenames(traj, Params, config, rng);
+ this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);


@@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
- std::string config, rng;
+ std::string config, rng, smr;
- this->build_filenames(traj, Params, config, rng);
+ this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);


@@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {
/*! @brief structure holding the link treatment */
- struct SmearingParameters{
+ struct HISQSmearingParameters{
- SmearingParameters(){}
+ HISQSmearingParameters(){}
Real c_1; // 1 link
Real c_naik; // Naik term
Real c_3; // 3 link
Real c_5; // 5 link
Real c_7; // 7 link
Real c_lp; // 5 link Lepage
- SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
+ HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: c_1(c1),
c_naik(cnaik),
c_3(c3),
@@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {
private:
GridCartesian* const _grid;
- SmearingParameters _linkTreatment;
+ HISQSmearingParameters _linkTreatment;
public:
@@ -117,7 +117,7 @@ public:
// IN--u_thin
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
- SmearingParameters lt = this->_linkTreatment;
+ HISQSmearingParameters lt = this->_linkTreatment;
auto grid = this->_grid;
// Create a padded cell of extra padding depth=1 and fill the padding.
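Illustrative sketch, not from the commit: the rename is purely mechanical and construction is unchanged apart from the type name. The coefficient values below are placeholders for the example, not the actual HISQ link coefficients:

  Real c1 = 1.0, cnaik = 0.0, c3 = 0.05, c5 = 0.01, c7 = 0.005, clp = 0.01; // illustrative values only
  HISQSmearingParameters params(c1, cnaik, c3, c5, c7, clp);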


@@ -207,11 +207,14 @@ std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(con
}
template <class Gimpl>
- void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
+ void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int meas_interval){
- addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+ addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
});
- addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+ addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+ std::cout << GridLogMessage << "[WilsonFlow] Energy density (cloverleaf) : " << step << " " << t << " " << energyDensityCloverleaf(t,U) << std::endl;
+ });
+ addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
});
}
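Illustrative sketch, not from the commit: with a single meas_interval the three default observables are reported on the same schedule; extra observables can still be attached at any interval via addMeasurement, whose callback signature is visible above. WF is assumed to be a WilsonFlowBase<Gimpl>-derived object with Gimpl in scope:

  WF.setDefaultMeasurements(10);   // plaquette energy, cloverleaf energy, topological charge every 10 steps
  WF.addMeasurement(5, [](int step, RealD t, const typename Gimpl::GaugeField &U){
    std::cout << GridLogMessage << "[WilsonFlow] Plaquette : " << step << " " << t
              << " " << WilsonLoops<Gimpl>::avgPlaquette(U) << std::endl;
  });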


@@ -292,19 +292,21 @@ public:
//////////////////////////////////////////////////
// the sum over all nu-oriented staples for nu != mu on each site
//////////////////////////////////////////////////
- static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
+ static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {
- GridBase *grid = Umu.Grid();
- std::vector<GaugeMat> U(Nd, grid);
+ std::vector<GaugeMat> Umu(Nd, U.Grid());
for (int d = 0; d < Nd; d++) {
- U[d] = PeekIndex<LorentzIndex>(Umu, d);
+ Umu[d] = PeekIndex<LorentzIndex>(U, d);
}
- Staple(staple, U, mu);
+ Staple(staple, Umu, mu);
}
- static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
+ static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {
- staple = Zero();
+ autoView(staple_v, staple, AcceleratorWrite);
+ accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
+ staple_v[i] = Zero();
+ });
for (int nu = 0; nu < Nd; nu++) {
@@ -318,12 +320,12 @@ public:
// |
// __|
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
- U[nu], nu,
+ Umu[nu], nu,
Gimpl::CovShiftBackward(
- U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+ Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
mu);
// __
@@ -333,8 +335,8 @@ public:
//
staple += Gimpl::ShiftStaple(
- Gimpl::CovShiftBackward(U[nu], nu,
+ Gimpl::CovShiftBackward(Umu[nu], nu,
- Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+ Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
}
}
}


@@ -363,12 +363,16 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
+ // std::cout << "Communicate Begin "<<std::endl;
+ // _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){
+ // std::cout << "Communicate prepare "<<i<<std::endl;
+ // _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@@ -376,8 +380,15 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
+ // std::cout << "Communicate PollDtoH "<<std::endl;
+ // _grid->Barrier();
+ _grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
+ // std::cout << "Communicate CopySynch "<<std::endl;
+ // _grid->Barrier();
acceleratorCopySynchronise();
+ // Starts intranode
for(int i=0;i<Packets.size();i++){
+ // std::cout << "Communicate Begin "<<i<<std::endl;
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@@ -395,7 +406,14 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
+ // std::cout << "Communicate Complete "<<std::endl;
+ // _grid->Barrier();
FlightRecorder::StepLog("Start communicate complete");
+ // std::cout << "Communicate Complete PollIRecv "<<std::endl;
+ // _grid->Barrier();
+ _grid->StencilSendToRecvFromPollIRecv(MpiReqs);
+ // std::cout << "Communicate Complete Complete "<<std::endl;
+ // _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
@@ -428,6 +446,7 @@ public:
Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
+ accelerator_barrier();
}
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@@ -483,6 +502,9 @@ public:
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
// accelerator_barrier();
+ //////////////////////////////////
+ // I will overwrite my send buffers
+ //////////////////////////////////
_grid->StencilBarrier();// Synch shared memory on a single nodes
assert(source.Grid()==_grid);
@@ -496,7 +518,11 @@ public:
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier(); // All my local gathers are complete
- // _grid->StencilBarrier();// Synch shared memory on a single nodes
+ #ifdef NVLINK_GET
+ _grid->StencilBarrier(); // He can now get mu local gather, I can get his
+ // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
+ // Or issue barrier AFTER the DMA is running
+ #endif
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
}
@@ -535,6 +561,7 @@ public:
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
acceleratorFenceComputeStream();
+ // Also fenced in WilsonKernels
}
}
@@ -663,7 +690,7 @@ public:
}
}
}
- std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+ // std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@@ -683,6 +710,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
+ // std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@@ -774,8 +802,8 @@ public:
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
- std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
+ // std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
- <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
+ // <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
for(int ii=0;ii<npoints;ii++){


@@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};
+ typedef int acceleratorEvent_t;
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
- inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
- inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
- inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
- inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
- inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+ inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+ acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
+ return 0;
+ }
+ inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+ acceleratorCopyFromDevice(from,to,bytes);
+ return 0;
+ }
+ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
+ return 0;
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
+ inline void acceleratorEventWait(acceleratorEvent_t ev)
+ {
+ //auto discard=cudaStreamSynchronize(ev);
+ }
+ inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
inline int acceleratorIsCommunicable(void *ptr)
@@ -343,11 +357,28 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
- inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
- inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
- inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
- inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
- inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+ ///////
+ // Asynch event interface
+ ///////
+ typedef sycl::event acceleratorEvent_t;
+ inline void acceleratorEventWait(acceleratorEvent_t ev)
+ {
+ ev.wait();
+ }
+ inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
+ {
+ return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
+ }
+ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
+ inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
+ inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
+ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
@@ -358,8 +389,10 @@ inline int acceleratorIsCommunicable(void *ptr)
else return 0;
#endif
return 1;
}
#endif
//////////////////////////////////////////////
@@ -459,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
- auto err = hipMallocHost((void **)&ptr,bytes);
+ auto err = hipHostMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@@ -492,23 +525,35 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
- inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
- inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
- inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+ typedef int acceleratorEvent_t;
+ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+ return 0;
}
- inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+ inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
- auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
+ acceleratorCopyToDevice(from,to,bytes);
+ return 0;
}
- inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+ inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
- auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
+ acceleratorCopyFromDevice(from,to,bytes);
+ return 0;
}
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
+ inline void acceleratorEventWait(acceleratorEvent_t ev)
+ {
+ // auto discard=hipStreamSynchronize(ev);
+ }
+ inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
#endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
@@ -545,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
#undef GRID_SIMT
+ typedef int acceleratorEvent_t;
inline void acceleratorMem(void)
{
/*
@@ -565,8 +612,13 @@ inline void acceleratorMem(void)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
- inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
- inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
+ inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
+ inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
+ inline void acceleratorEventWait(acceleratorEvent_t ev){}
+ inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
+ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -655,9 +707,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
acceleratorCopySynchronise();
}
- template<class T> void acceleratorPut(T& dev,T&host)
+ template<class T> void acceleratorPut(T& dev,const T&host)
{
- acceleratorCopyToDevice(&host,&dev,sizeof(T));
+ acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
}
template<class T> T acceleratorGet(T& dev)
{

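Illustrative sketch, not from the commits: the intended usage pattern of the new event-returning copy API, using only functions introduced above (hostBuf, devBuf and bytes are assumed to exist):

  acceleratorEvent_t ev = acceleratorCopyToDeviceAsynch(hostBuf, devBuf, bytes);
  // ... overlap independent host or kernel work here ...
  if (!acceleratorEventIsComplete(ev)) acceleratorEventWait(ev);

Note that in this version only the SYCL backend returns a real sycl::event; on CUDA, HIP and the host-only build acceleratorEvent_t is an int, the "Asynch" copies fall back to blocking paths, and the wait/poll calls are no-ops, so the interface is forward-looking rather than an overlap guarantee.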

@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#pragma once
+ #ifndef MIN
+ #define MIN(x,y) ((x)>(y)?(y):(x))
+ #endif
// Introduce a class to gain deterministic bit reproducible reduction.
// make static; perhaps just a namespace is required.
NAMESPACE_BEGIN(Grid);


@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_critical DO_PRAGMA(omp critical)
#ifdef GRID_OMP
- inline void thread_bcopy(void *from, void *to,size_t bytes)
+ inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
- uint64_t *ufrom = (uint64_t *)from;
+ const uint64_t *ufrom = (const uint64_t *)from;
uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0);
uint64_t words=bytes/8;
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
});
}
#else
- inline void thread_bcopy(void *from, void *to,size_t bytes)
+ inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
bcopy(from,to,bytes);
}


@@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt,
Grid_default_mpi);
+ if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
+ std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
+ FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+ FlightRecorder::PrintEntireLog = 1;
+ FlightRecorder::ChecksumComms = 1;
+ FlightRecorder::ChecksumCommsSend=1;
+ }
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@@ -651,3 +658,4 @@ void Grid_debug_handler_init(void)
}
NAMESPACE_END(Grid);


@@ -50,7 +50,7 @@ namespace Grid{
int64_t index64;
IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){
- std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+ // std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
}
assert(index64<2*1024*1024*1024LL);
index = (int) index64;


@@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
+ #if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
+ #endif
using namespace Grid;
int main(int argc, char **argv)
{
+ #if Nc != 3
+ #warning FTHMC2p1f will not work for Nc != 3
+ std::cout << "This program will currently only work for Nc == 3." << std::endl;
+ #else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,7 +227,6 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
+ #endif
} // main


@@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
+ #if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
+ #endif
using namespace Grid;
int main(int argc, char **argv)
{
+ #if Nc != 3
+ #warning FTHMC2p1f_3GeV will not work for Nc != 3
+ std::cout << "This program will currently only work for Nc == 3." << std::endl;
+ #else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,6 +228,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
+ #endif
} // main


@@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
+ #if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
+ #endif
using namespace Grid;
int main(int argc, char **argv)
{
+ #if Nc != 3
+ #warning HMC2p1f_3GeV will not work for Nc != 3
+ std::cout << "This program will currently only work for Nc == 3." << std::endl;
+ #else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,6 +227,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
+ #endif
} // main


@@ -52,7 +52,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
- int Ls=8;
+ int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;


@@ -492,17 +492,18 @@ public:
}
FGrid->Barrier();
double t1=usecond();
- uint64_t ncall = 500;
+ uint64_t no = 50;
+ uint64_t ni = 100;
- FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
- std::vector<double> t_time(ncall);
+ std::vector<double> t_time(no);
- for(uint64_t i=0;i<ncall;i++){
+ for(uint64_t i=0;i<no;i++){
t0=usecond();
- Dw.DhopEO(src_o,r_e,DaggerNo);
+ for(uint64_t j=0;j<ni;j++){
+ Dw.DhopEO(src_o,r_e,DaggerNo);
+ }
t1=usecond();
t_time[i] = t1-t0;
}
@@ -520,11 +521,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
- mf_hi = flops/timestat.min;
+ mf_hi = flops/timestat.min*ni;
- mf_lo = flops/timestat.max;
+ mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
- mflops = flops/timestat.mean;
+ mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@@ -535,6 +536,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
+ std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@@ -654,17 +656,19 @@ public:
}
FGrid->Barrier();
double t1=usecond();
- uint64_t ncall = 500;
- FGrid->Broadcast(0,&ncall,sizeof(ncall));
+ uint64_t no = 50;
+ uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
- std::vector<double> t_time(ncall);
+ std::vector<double> t_time(no);
- for(uint64_t i=0;i<ncall;i++){
+ for(uint64_t i=0;i<no;i++){
t0=usecond();
- Ds.DhopEO(src_o,r_e,DaggerNo);
+ for(uint64_t j=0;j<ni;j++){
+ Ds.DhopEO(src_o,r_e,DaggerNo);
+ }
t1=usecond();
t_time[i] = t1-t0;
}
@@ -675,11 +679,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
- mf_hi = flops/timestat.min;
+ mf_hi = flops/timestat.min*ni;
- mf_lo = flops/timestat.max;
+ mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
- mflops = flops/timestat.mean;
+ mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@@ -689,6 +693,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
+ std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@@ -792,19 +797,18 @@ public:
Dc.M(src,r); Dc.M(src,r);
} }
FGrid->Barrier(); FGrid->Barrier();
double t1=usecond(); uint64_t ni = 100;
uint64_t ncall = 500; uint64_t no = 50;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat; time_statistics timestat;
std::vector<double> t_time(ncall); std::vector<double> t_time(no);
for(uint64_t i=0;i<ncall;i++){ for(uint64_t i=0;i<no;i++){
t0=usecond(); double t0=usecond();
Dc.M(src,r); for(uint64_t j=0;j<ni;j++){
t1=usecond(); Dc.M(src,r);
}
double t1=usecond();
t_time[i] = t1-t0; t_time[i] = t1-t0;
} }
FGrid->Barrier(); FGrid->Barrier();
@@ -814,20 +818,21 @@ public:
double mf_hi, mf_lo, mf_err; double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time); timestat.statistics(t_time);
mf_hi = flops/timestat.min; mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max; mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean; mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean; mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops); mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops; if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops; if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
} }
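The three hunks above apply the same refactor: rather than timing ncall individual applications, each timed sample now wraps ni = 100 back-to-back calls inside one of no = 50 outer samples, so the flop rates are rescaled by ni and a new "us per call" line reports timestat.mean/ni. A minimal self-contained sketch of that pattern, with a generic op() standing in for Ds.DhopEO or Dc.M, and std::chrono plus a plain mean standing in for Grid's usecond() and time_statistics (an illustration of the idea, not the exact benchmark source):

```cpp
#include <chrono>
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch of the nested timing loop introduced above: 'no' outer samples,
// each timing 'ni' back-to-back operator applications.
template <class Op>
double mflops_estimate(Op &&op, double flops_per_call,
                       uint64_t ni = 100,   // inner calls per timed sample
                       uint64_t no = 50)    // outer (timed) samples
{
  std::vector<double> t_us(no);
  for (uint64_t i = 0; i < no; i++) {
    auto t0 = std::chrono::steady_clock::now();
    for (uint64_t j = 0; j < ni; j++) op();          // ni calls per sample
    auto t1 = std::chrono::steady_clock::now();
    t_us[i] = std::chrono::duration<double, std::micro>(t1 - t0).count();
  }
  double mean_us = std::accumulate(t_us.begin(), t_us.end(), 0.0) / no;
  // Each sample covers ni applications: rescale the rate by ni
  // (cf. flops/timestat.mean*ni); the per-call time is mean_us/ni.
  return flops_per_call / mean_us * ni;              // FLOPs per us == MFlop/s
}
```

Batching ni calls per sample keeps the timer overhead negligible and makes each sample long enough for the min/max/err statistics to be meaningful on fast operators.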
@@ -868,11 +873,11 @@ int main (int argc, char ** argv)
int do_su4=0; int do_su4=0;
int do_memory=1; int do_memory=1;
int do_comms =1; int do_comms =1;
int do_blas =1; int do_blas =0;
int do_dslash=1; int do_dslash=1;
int sel=4; int sel=4;
std::vector<int> L_list({8,12,16,24}); std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1; int selm1=sel-1;
std::vector<double> clover; std::vector<double> clover;

View File

@@ -151,7 +151,7 @@ AC_ARG_ENABLE([tracing],
case ${ac_TRACING} in case ${ac_TRACING} in
nvtx) nvtx)
AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX]) AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
LIBS="${LIBS} -lnvToolsExt64_1" LIBS="${LIBS} -lnvToolsExt"
;; ;;
roctx) roctx)
AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX]) AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])

View File

@@ -93,10 +93,13 @@ int main(int argc, char ** argv)
Real coeff = (width*width) / Real(4*Iterations); Real coeff = (width*width) / Real(4*Iterations);
chi=kronecker; chi=kronecker;
// chi = (1-p^2/2N)^N kronecker // chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) { for(int n = 0; n < Iterations; ++n) {
Laplacian.M(chi,psi); Laplacian.M(chi,psi);
chi = chi - coeff*psi; chi = chi - coeff*psi;
RealD n2 = norm2(chi);
chi = chi * (1.0/std::sqrt(n2));
} }
std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl; std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;

View File

@@ -32,15 +32,9 @@ export MPICH_OFI_NIC_POLICY=GPU
# Local vol 16.16.16.32 # Local vol 16.16.16.32
# #
VOL 128.64.128.96 LX=16
MPI 4.4.4.3
NPROC 192
mpiexec -np 192 -ppn 12 -envall ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 4.4.4.3 --grid 128.64.128.96 --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap
LX=32
LY=16 LY=16
LZ=32 LZ=16
LT=32 LT=32
NX=2 NX=2

View File

@@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

View File

@@ -1,18 +1,19 @@
#Ahead of time compile for PVC #Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/" export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
#JIT compile #JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../../configure \ ../configure \
--enable-simd=GPU \ --enable-simd=GPU \
--enable-reduction=grid \ --enable-reduction=grid \
--enable-gen-simd-width=64 \ --enable-gen-simd-width=64 \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-debug \ --enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \ --disable-gparity \
--disable-fermion-reps \ --disable-fermion-reps \
--with-lime=$CLIME \ --with-lime=$CLIME \

View File

@@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@@ -0,0 +1,16 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
#module load cce/15.0.1
module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

View File

@@ -30,14 +30,10 @@ source ${root}/sourceme.sh
export OMP_NUM_THREADS=7 export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1 export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 32.32.32.64 for vol in 64.64.32.64
do do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
done done

View File

@@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \ --with-lime=$CLIME \
--enable-unified=no \ --enable-unified=no \
--enable-shm=nvlink \ --enable-shm=nvlink \
--enable-tracing=timer \ --enable-tracing=none \
--enable-accelerator=hip \ --enable-accelerator=hip \
--enable-gen-simd-width=64 \ --enable-gen-simd-width=64 \
--disable-gparity \ --disable-gparity \
--disable-fermion-reps \ --disable-fermion-reps \
--enable-simd=GPU \ --enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \ --with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \ --with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \ --disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \ CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \ CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas" LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@@ -1,12 +1,25 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs module load cce/15.0.1
module load PrgEnv-gnu module load rocm/5.3.0
module load rocm/6.0.0
module load cray-mpich
module load gmp
module load cray-fftw module load cray-fftw
module load craype-accel-amd-gfx90a module load craype-accel-amd-gfx90a
#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib #Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH ##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH

systems/WorkArounds.txt (new file, 206 lines)
View File

@@ -0,0 +1,206 @@
The purpose of this file is to collate all non-obvious known magic shell variables
and compiler flags required for either correctness or performance on various systems.
A repository of work-arounds.
Contents:
1. Interconnect + MPI
2. Compilation
3. Profiling
************************
* 1. INTERCONNECT + MPI
************************
--------------------------------------------------------------------
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
--------------------------------------------------------------------
export OMPI_MCA_io=romio321
--------------------------------------
ROMIO fail with > 2GB per node read (32 bit issue)
--------------------------------------
Use later MPICH
https://github.com/paboyle/Grid/issues/381
https://github.com/pmodels/mpich/commit/3a479ab0
--------------------------------------------------------------------
Slingshot: Frontier and Perlmutter libfabric slowdown
and physical memory fragmentation
--------------------------------------------------------------------
export FI_MR_CACHE_MONITOR=disabled
or
export FI_MR_CACHE_MONITOR=kdreg2
--------------------------------------------------------------------
Perlmutter
--------------------------------------------------------------------
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0
--------------------------------------------------------------------
Frontier/LumiG
--------------------------------------------------------------------
Hiding ROCR_VISIBLE_DEVICES triggers the SDMA engines to be used for GPU-GPU transfers
cat << EOF > select_gpu
#!/bin/bash
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
srun ./select_gpu BINARY
--------------------------------------------------------------------
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
--------------------------------------------------------------------
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export UCX_MEMTYPE_CACHE=n
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
https://github.com/pmodels/mpich/issues/7302
--enable-cuda-aware-mpi=no
--enable-unified=no
These select Grid's internal D-H-H-D pipeline mode and avoid passing device memory to MPI
Do not use SVM
Ideally use MPICH with fix to issue 7302:
https://github.com/pmodels/mpich/pull/7312
Ideally:
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
Alternatives:
export MPIR_CVAR_NOLOCAL=1
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
Broken:
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
This gives good performance without requiring
--enable-cuda-aware-mpi=no
But is an open issue reported by James Osborn
https://github.com/pmodels/mpich/issues/7139
Possibly resolved but unclear if in the installed software yet.
************************
* 2. COMPILATION
************************
--------------------------------------------------------------------
G++ compiler breakage / graveyard
--------------------------------------------------------------------
9.3.0, 10.3.1,
https://github.com/paboyle/Grid/issues/290
https://github.com/paboyle/Grid/issues/264
Working (-) Broken (X):
4.9.0 -
4.9.1 -
5.1.0 X
5.2.0 X
5.3.0 X
5.4.0 X
6.1.0 X
6.2.0 X
6.3.0 -
7.1.0 -
8.0.0 (HEAD) -
https://github.com/paboyle/Grid/issues/100
--------------------------------------------------------------------
AMD GPU nodes :
--------------------------------------------------------------------
Multiple ROCm versions are broken; use 5.3.0.
The breakage manifests itself as wrong results in fp32.
https://github.com/paboyle/Grid/issues/464
--------------------------------------------------------------------
Aurora/PVC
--------------------------------------------------------------------
SYCL ahead-of-time compilation (fixes rare runtime JIT errors and gives a faster runtime, PB)
SYCL slow link and relocatable code issues (Christoph Lehner)
The -ze-opt-large-register-file option is required for good performance in fp64:
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
--------------------------------------------------------------------
Aurora/PVC useful extra options
--------------------------------------------------------------------
Host only sanitizer:
-Xarch_host -fsanitize=leak
-Xarch_host -fsanitize=address
Deterministic MPI reduction:
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
************************
* 3. Visual profile tools
************************
--------------------------------------------------------------------
Frontier/rocprof
--------------------------------------------------------------------
--------------------------------------------------------------------
Aurora/unitrace
--------------------------------------------------------------------
--------------------------------------------------------------------
Tursa/nsight-sys
--------------------------------------------------------------------

View File

@@ -1,2 +1,16 @@
CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=c++-13 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
CXX=mpicxx ../../configure \
--enable-simd=GEN \
--enable-comms=mpi-auto \
--enable-Sp=yes \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/vtk/Grid/install \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-openssl=$OPENSSL \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-debug

View File

@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
for vol in 48.48.48.96
do
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
done
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
done

View File

@@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=2
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
nodes=2
mpi=1.1.1.2
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
for vol in 64.64.64.128
do
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
done
done

View File

@@ -0,0 +1,16 @@
../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
--enable-shm=shmopen \
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17"

View File

@@ -0,0 +1,4 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi

View File

@@ -0,0 +1,17 @@
../../src/Grid/configure \
--prefix /home/pab/NPR/install \
--enable-comms=mpi-auto \
--enable-simd=AVX2 \
--enable-shm=none \
--enable-debug \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-gparity \
--disable-fermion-reps \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17 "

View File

@@ -0,0 +1,28 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@12
spack load autoconf%clang@12.0.1
spack load automake%clang@12.0.1
spack load c-lime%clang@12.0.1
spack load fftw%clang@12.0.1
spack load gmp%clang@12.0.1
spack load mpfr%clang@12.0.1
spack load openmpi%clang@12.0.1
spack load openssl%clang@12.0.1
spack load hdf5+cxx%clang@12.0.1
spack load cmake%clang@12.0.1
export FFTW=`spack find --paths fftw%clang@12.0.1 | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1 | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
export MPFR=`spack find --paths mpfr%clang@12.0.1 | grep ^mpfr | awk '{print $2}' `
export LLVM=`spack find --paths llvm@12 | grep ^llvm | awk '{print $2}' `
export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk '{print $2}' `
export GMP=`spack find --paths gmp%clang@12.0.1 | grep ^gmp | awk '{print $2}' `
export TCLAP=`spack find --paths tclap%clang@12.0.1 | grep ^tclap | awk '{print $2}' `
export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH
ulimit -s 81920

View File

@@ -0,0 +1,19 @@
cd
git clone https://github.com/spack/spack.git
source $HOME/spack/share/spack/setup-env.sh
spack install llvm@12
spack install autoconf%clang@12.0.1
spack install automake%clang@12.0.1
spack install c-lime%clang@12.0.1
spack install fftw%clang@12.0.1
spack install gmp%clang@12.0.1
spack install mpfr%clang@12.0.1
spack install openmpi%clang@12.0.1
spack install openssl%clang@12.0.1
spack install hdf5+cxx%clang@12.0.1
spack install cmake%clang@12.0.1
spack install tclap%clang@12.0.1
spack install emacs%clang@12.0.1

View File

@@ -62,7 +62,7 @@ int VerifyOnDevice(const FermionField &res, FermionField &ref)
if (((random()&0xF)==0)&&injection) { if (((random()&0xF)==0)&&injection) {
uint64_t sF = random()%(NN); uint64_t sF = random()%(NN);
int lane=0; int lane=0;
printf("Error injection site %ld on rank %d\n",sF,res.Grid()->ThisRank()); printf("Error injection site %ld on rank %d\n",(long)sF,res.Grid()->ThisRank());
auto vv = acceleratorGet(res_v[sF]); auto vv = acceleratorGet(res_v[sF]);
double *dd = (double *)&vv; double *dd = (double *)&vv;
*dd=M_PI; *dd=M_PI;

View File

@@ -195,8 +195,8 @@ int main (int argc, char ** argv)
int Nk=nrhs; int Nk=nrhs;
int Nm=Nk*3; int Nm=Nk*3;
int Nk=36; // int Nk=36;
int Nm=144; // int Nm=144;
int Nstop=Nk; int Nstop=Nk;
int Nconv_test_interval=1; int Nconv_test_interval=1;

View File

@@ -47,20 +47,20 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }; void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl; // std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.M(in,tmp); _Mat.M(in,tmp);
_PV.Mdag(tmp,out); _PV.Mdag(tmp,out);
} }
void AdjOp (const Field &in, Field &out){ void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl; // std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
_PV.M(in,tmp); _PV.M(in,tmp);
_Mat.Mdag(tmp,out); _Mat.Mdag(tmp,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl; // std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
// _Mat.M(in,tmp); // _Mat.M(in,tmp);
// _PV.Mdag(tmp,out); // _PV.Mdag(tmp,out);
@@ -83,14 +83,14 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }; void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl; // std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.M(in,tmp); _Mat.M(in,tmp);
_PV.Mdag(tmp,out); _PV.Mdag(tmp,out);
out = out + shift * in; out = out + shift * in;
} }
void AdjOp (const Field &in, Field &out){ void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl; // std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
_PV.M(tmp,out); _PV.M(tmp,out);
_Mat.Mdag(in,tmp); _Mat.Mdag(in,tmp);
@@ -98,7 +98,7 @@ public:
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl; // std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
Op(in,tmp); Op(in,tmp);
AdjOp(tmp,out); AdjOp(tmp,out);

View File

@@ -54,6 +54,7 @@ const RealD M5 = 1.8;
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv); Grid_init(&argc, &argv);
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
@@ -106,6 +107,6 @@ int main(int argc, char** argv)
Meofa.refresh(Umu,sRNG, RNG5); Meofa.refresh(Umu,sRNG, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu)); printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
} }
#endif
return 0; return 0;
} }

View File

@@ -56,6 +56,7 @@ const RealD M5 = 1.8;
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv); Grid_init(&argc, &argv);
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
@@ -106,6 +107,6 @@ int main(int argc, char** argv)
Meofa.refresh(Umu, sRNG, RNG5); Meofa.refresh(Umu, sRNG, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu)); printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
} }
#endif
return 0; return 0;
} }

View File

@@ -33,6 +33,7 @@ using namespace std;
using namespace Grid; using namespace Grid;
// This is to optimize the SIMD // This is to optimize the SIMD
/*
template<class vobj> void gpermute(vobj & inout,int perm){ template<class vobj> void gpermute(vobj & inout,int perm){
vobj tmp=inout; vobj tmp=inout;
if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;} if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
@@ -40,7 +41,7 @@ template<class vobj> void gpermute(vobj & inout,int perm){
if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;} if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;} if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
} }
*/
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {

View File

@@ -153,7 +153,7 @@ public:
t=usecond(); t=usecond();
{ {
autoView( gStaple_v , gStaple, AcceleratorWrite); autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View(); auto gStencil_v = gStencil.View(AcceleratorRead);
autoView( Ug_mu_v , Ug_mu, AcceleratorRead); autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
autoView( Ug_nu_v , Ug_nu, AcceleratorRead); autoView( Ug_nu_v , Ug_nu, AcceleratorRead);
@@ -389,7 +389,7 @@ public:
GeneralLocalStencil gStencil(ggrid,shifts); GeneralLocalStencil gStencil(ggrid,shifts);
{ {
autoView( gStaple_v , gStaple, AcceleratorWrite); autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View(); auto gStencil_v = gStencil.View(AcceleratorRead);
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType; typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
size_t vsize = Nd*sizeof(GaugeViewType); size_t vsize = Nd*sizeof(GaugeViewType);

View File

@@ -83,6 +83,7 @@ std::vector<RealD> jack_stats(const std::vector<RealD>& data)
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv); Grid_init(&argc, &argv);
// Initialize spacetime grid // Initialize spacetime grid
@@ -206,4 +207,5 @@ int main(int argc, char **argv)
std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl; std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl;
Grid_finalize(); Grid_finalize();
#endif
} }

View File

@@ -85,6 +85,7 @@ std::vector<RealD> jack_stats(const std::vector<RealD>& data)
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv); Grid_init(&argc, &argv);
// Initialize spacetime grid // Initialize spacetime grid
@@ -215,4 +216,5 @@ int main(int argc, char **argv)
std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl; std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl;
Grid_finalize(); Grid_finalize();
#endif
} }

View File

@@ -35,6 +35,7 @@ using namespace Grid;
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
@@ -244,4 +245,5 @@ int main (int argc, char ** argv)
std::cout<< GridLogMessage << "Done" <<std::endl; std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize(); Grid_finalize();
#endif
} }

View File

@@ -38,6 +38,7 @@ typedef typename FermionAction::FermionField FermionField;
int main (int argc, char** argv) int main (int argc, char** argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv); Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
@@ -173,4 +174,5 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl; std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize(); Grid_finalize();
#endif
} }

View File

@@ -35,6 +35,7 @@ using namespace Grid;
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
@@ -204,4 +205,5 @@ int main (int argc, char ** argv)
assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ; assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
std::cout<< GridLogMessage << "Done" <<std::endl; std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize(); Grid_finalize();
#endif
} }

View File

@@ -32,6 +32,7 @@ using namespace std;
using namespace Grid; using namespace Grid;
//Here we test the G-parity action and force between the 1f (doubled-lattice) and 2f approaches //Here we test the G-parity action and force between the 1f (doubled-lattice) and 2f approaches
#ifdef ENABLE_GPARITY
void copyConjGauge(LatticeGaugeFieldD &Umu_1f, const LatticeGaugeFieldD &Umu_2f, const int nu){ void copyConjGauge(LatticeGaugeFieldD &Umu_1f, const LatticeGaugeFieldD &Umu_2f, const int nu){
@@ -444,3 +445,7 @@ int main (int argc, char ** argv)
assert(0); assert(0);
} }
} }
#else
int main (int argc, char ** argv){};
#endif

View File

@@ -32,6 +32,7 @@ using namespace Grid;
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
@@ -155,4 +156,5 @@ int main (int argc, char ** argv)
std::cout<< GridLogMessage << "Done" <<std::endl; std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize(); Grid_finalize();
#endif
} }

View File

@@ -30,9 +30,10 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Grid.h> #include <Grid/Grid.h>
#ifdef ENABLE_GPARITY
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
;
typedef GparityWilsonImplD FermionImplPolicyD; typedef GparityWilsonImplD FermionImplPolicyD;
typedef GparityMobiusEOFAFermionD FermionActionD; typedef GparityMobiusEOFAFermionD FermionActionD;
@@ -231,3 +232,7 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl; std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize(); Grid_finalize();
} }
#else
int main(int argc,char ** argv) { return 0;};
#endif

View File

@@ -31,14 +31,14 @@ See the full license in the file "LICENSE" in the top level distribution directo
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
;
typedef GparityWilsonImplR FermionImplPolicy; typedef GparityWilsonImplR FermionImplPolicy;
typedef GparityMobiusEOFAFermionD FermionAction; typedef GparityMobiusEOFAFermionD FermionAction;
typedef typename FermionAction::FermionField FermionField; typedef typename FermionAction::FermionField FermionField;
int main (int argc, char** argv) int main (int argc, char** argv)
{ {
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv); Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
@@ -171,4 +171,5 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl; std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize(); Grid_finalize();
#endif
} }

View File

@@ -30,7 +30,7 @@
using namespace Grid; using namespace Grid;
#ifdef ENABLE_GPARITY
template<typename FermionField2f, typename FermionField1f> template<typename FermionField2f, typename FermionField1f>
void copy2fTo1fFermionField(FermionField1f &out, const FermionField2f &in, int gpdir){ void copy2fTo1fFermionField(FermionField1f &out, const FermionField2f &in, int gpdir){
@@ -255,3 +255,6 @@ int main(int argc, char **argv) {
} // main } // main
#else
int main(int argc, char **argv){};
#endif

View File

@@ -30,6 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
int main(int argc, char **argv) { int main(int argc, char **argv) {
#ifdef ENABLE_GPARITY
using namespace Grid; using namespace Grid;
; ;
@@ -139,7 +140,7 @@ int main(int argc, char **argv) {
Grid_finalize(); Grid_finalize();
#endif
} // main } // main

View File

@@ -55,13 +55,13 @@ namespace Grid{
}; };
struct SmearingParameters: Serializable { struct HmcSmearingParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(SmearingParameters, GRID_SERIALIZABLE_CLASS_MEMBERS(HmcSmearingParameters,
double, rho, double, rho,
Integer, Nsmear) Integer, Nsmear)
template <class ReaderClass > template <class ReaderClass >
SmearingParameters(Reader<ReaderClass>& Reader){ HmcSmearingParameters(Reader<ReaderClass>& Reader){
read(Reader, "StoutSmearing", *this); read(Reader, "StoutSmearing", *this);
} }
@@ -213,7 +213,7 @@ int main(int argc, char **argv) {
// Reset performance counters // Reset performance counters
if (ApplySmearing){ if (ApplySmearing){
SmearingParameters SmPar(Reader); HmcSmearingParameters SmPar(Reader);
//double rho = 0.1; // smearing parameter //double rho = 0.1; // smearing parameter
//int Nsmear = 3; // number of smearing levels //int Nsmear = 3; // number of smearing levels
Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho); Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho);

View File

@@ -0,0 +1,15 @@
<?xml version="1.0"?>
<grid>
<LanczosParameters>
<mass>-1.025</mass>
<mstep>-0.025</mstep>
<M5>1.8</M5>
<Ls>48</Ls>
<Nstop>10</Nstop>
<Nk>12</Nk>
<Np>30</Np>
<ChebyLow>0.1</ChebyLow>
<ChebyHigh>50</ChebyHigh>
<ChebyOrder>51</ChebyOrder>
</LanczosParameters>
</grid>

View File

@@ -35,6 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h> #include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
#ifdef ENABLE_GPARITY
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
@@ -378,7 +380,8 @@ void runTest(const Options &opt){
//Note: because we rely upon physical properties we must use a "real" gauge configuration //Note: because we rely upon physical properties we must use a "real" gauge configuration
int main (int argc, char ** argv) { int main (int argc, char ** argv)
{
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
GridLogIRL.TimingMode(1); GridLogIRL.TimingMode(1);
@@ -482,4 +485,8 @@ int main (int argc, char ** argv) {
Grid_finalize(); Grid_finalize();
} }
#else
int main(int argc, char **argv){};
#endif

View File

@@ -0,0 +1,351 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_G5R5.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
From Duo and Bob's Chirality study
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
#if 0
typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField;
#else
typedef MobiusFermionD FermionOp;
typedef typename MobiusFermionD::FermionField FermionField;
#endif
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, M5 ,
Integer, Ls,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
int Ls=16;
RealD M5=1.8;
RealD mass = -1.0;
mass=LanParams.mass;
Ls=LanParams.Ls;
M5=LanParams.M5;
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// GridCartesian* FGrid = UGrid;
// GridRedBlackCartesian* FrbGrid = UrbGrid;
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 10;
int Nk = 20;
int Np = 80;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mob_b=1.5;
//while ( mass > - 5.0){
// FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
// Gamma5R5HermitianLinearOperator
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (G5R5Herm);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
#if 0
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
// RealD eMe,eMMe;
for (int i = 0; i < Nstop ; i++) {
// tmp = g5*evec[i];
dot = innerProduct(evec[i],evec[i]);
// G5R5(tmp,evec[i]);
G5R5Herm.HermOpAndNorm(evec[i],tmp,eMe,eMMe);
std::cout <<"Norm "<<M5<<" "<< mass << " : " << i << " " << real(dot) << " " << imag(dot) << " "<< eMe << " " <<eMMe<< std::endl ;
for (int j = 0; j < Nstop ; j++) {
dot = innerProduct(tmp,evec[j]);
std::cout <<"G5R5 "<<M5<<" "<< mass << " : " << i << " " <<j<<" " << real(dot) << " " << imag(dot) << std::endl ;
}
}
// src = evec[0]+evec[1]+evec[2];
// mass += -0.1;
#endif
//**********************************************************************
//orthogonalization
//calculate the matrix
cout << "Start orthogonalization " << endl;
cout << "calculate the matrix element" << endl;
vector<LatticeFermion> G5R5Mevec(Nconv, FGrid);
vector<LatticeFermion> finalevec(Nconv, FGrid);
vector<RealD> eMe(Nconv), eMMe(Nconv);
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
cout << eMe << endl;
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
vector<vector<ComplexD>> VevecG5R5Mevec(Nconv);
Eigen::MatrixXcd evecG5R5Mevec = Eigen::MatrixXcd::Zero(Nconv, Nconv);
for(int i = 0; i < Nconv; i++){
VevecG5R5Mevec[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
VevecG5R5Mevec[i][j] = innerProduct(evec[i], G5R5Mevec[j]);
evecG5R5Mevec(i, j) = VevecG5R5Mevec[i][j];
}
}
//calculate eigenvector
cout << "Eigen solver" << endl;
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(evecG5R5Mevec);
vector<RealD> eigeneval(Nconv);
vector<vector<ComplexD>> eigenevec(Nconv);
for(int i = 0; i < Nconv; i++){
eigeneval[i] = eigensolver.eigenvalues()[i];
eigenevec[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
eigenevec[i][j] = eigensolver.eigenvectors()(i, j);
}
}
//rotation
cout << "Do rotation" << endl;
for(int i = 0; i < Nconv; i++){
finalevec[i] = finalevec[i] - finalevec[i];
for(int j = 0; j < Nconv; j++){
finalevec[i] = eigenevec[j][i]*evec[j] + finalevec[i];
}
}
//normalize again;
for(int i = 0; i < Nconv; i++){
RealD tmp_RealD = norm2(finalevec[i]);
tmp_RealD = 1./pow(tmp_RealD, 0.5);
finalevec[i] = finalevec[i]*tmp_RealD;
}
//check
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
//**********************************************************************
//sort the eigenvectors
vector<LatticeFermion> finalevec_copy(Nconv, FGrid);
for(int i = 0; i < Nconv; i++){
finalevec_copy[i] = finalevec[i];
}
vector<RealD> eMe_copy(eMe);
for(int i = 0; i < Nconv; i++){
eMe[i] = fabs(eMe[i]);
eMe_copy[i] = eMe[i];
}
sort(eMe_copy.begin(), eMe_copy.end());
for(int i = 0; i < Nconv; i++){
for(int j = 0; j < Nconv; j++){
if(eMe[j] == eMe_copy[i]){
finalevec[i] = finalevec_copy[j];
}
}
}
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
cout << eMe << endl;
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
// vector<LatticeFermion> finalevec(Nconv, FGrid);
// temporary, until doing rotation
// for(int i = 0; i < Nconv; i++)
// finalevec[i]=evec[i];
//**********************************************************************
//calculate chirality matrix
vector<LatticeFermion> G5evec(Nconv, FGrid);
vector<vector<ComplexD>> chiral_matrix(Nconv);
vector<vector<RealD>> chiral_matrix_real(Nconv);
for(int i = 0; i < Nconv; i++){
// G5evec[i] = G5evec[i] - G5evec[i];
G5evec[i] = Zero();
for(int j = 0; j < Ls/2; j++){
axpby_ssp(G5evec[i], 1., finalevec[i], 0., G5evec[i], j, j);
}
for(int j = Ls/2; j < Ls; j++){
axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
}
}
for(int i = 0; i < Nconv; i++){
chiral_matrix_real[i].resize(Nconv);
chiral_matrix[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
}
}
for(int i = 0; i < Nconv; i++){
if(chiral_matrix[i][i].real() < 0.){
chiral_matrix_real[i][i] = -1. * chiral_matrix_real[i][i];
}
}
Grid_finalize();
}
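For reference, the chirality measurement at the end of the test amounts to the following, in the notation of the code above with v_i = finalevec[i] and s the fifth-dimension slice index (a sketch of what the loops implement, not text from the source); the final loop only restores the sign of the diagonal entries when Re X_{ii} < 0:

```latex
(\Gamma v)(s) =
\begin{cases}
  +\,v(s), & 0 \le s < L_s/2, \\
  -\,v(s), & L_s/2 \le s < L_s,
\end{cases}
\qquad
X_{ij} = \langle v_i,\,\Gamma v_j\rangle ,
\qquad
X^{\mathrm{real}}_{ij} = \lvert X_{ij}\rvert .
```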

View File

@@ -29,11 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
;
template<typename Action> template<typename Action>
struct Setup{}; struct Setup{};
#ifdef ENABLE_GPARITY
template<> template<>
struct Setup<GparityMobiusFermionF>{ struct Setup<GparityMobiusFermionF>{
static GparityMobiusFermionF* getAction(LatticeGaugeFieldF &Umu, static GparityMobiusFermionF* getAction(LatticeGaugeFieldF &Umu,
@@ -47,16 +47,24 @@ struct Setup<GparityMobiusFermionF>{
return new GparityMobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); return new GparityMobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
} }
}; };
#endif
template<> template<>
struct Setup<DomainWallFermionF>{ struct Setup<DomainWallFermionF>{
static DomainWallFermionF* getAction(LatticeGaugeFieldF &Umu, static DomainWallFermionF* getAction(LatticeGaugeFieldF &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.00054;
RealD M5=1.8;
return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
}
};
template<>
struct Setup<DomainWallFermionD>{ struct Setup<DomainWallFermionD>{
static DomainWallFermionD* getAction(LatticeGaugeField &Umu, static DomainWallFermionD* getAction(LatticeGaugeField &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.00054; RealD mass=0.00054;
RealD M5=1.8; RealD M5=1.8;
return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); return new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
} }
}; };
@@ -168,7 +176,9 @@ int main (int argc, char ** argv)
} }
if(action == "GparityMobius"){ if(action == "GparityMobius"){
#ifdef ENABLE_GPARITY
run<GparityMobiusFermionF>(); run<GparityMobiusFermionF>();
#endif
}else if(action == "DWF"){ }else if(action == "DWF"){
run<DomainWallFermionF>(); run<DomainWallFermionF>();
}else if(action == "Mobius"){ }else if(action == "Mobius"){

View File

@@ -555,6 +555,7 @@ int main (int argc, char ** argv) {
double c = (args.mobius_scale - bmc)/2.; // c = 1/2 [ (b+c) - (b-c) ] double c = (args.mobius_scale - bmc)/2.; // c = 1/2 [ (b+c) - (b-c) ]
if(is_gparity){ if(is_gparity){
#ifdef ENABLE_GPARITY
GparityWilsonImplD::ImplParams Params = setupGparityParams(args.GparityDirs); GparityWilsonImplD::ImplParams Params = setupGparityParams(args.GparityDirs);
readConfiguration<ConjugateGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field readConfiguration<ConjugateGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field
@@ -564,7 +565,10 @@ int main (int argc, char ** argv) {
}else if(action_s == "Mobius"){ }else if(action_s == "Mobius"){
GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params); GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params);
run(action, config, args); run(action, config, args);
} }
#else
assert(0);
#endif
}else{ }else{
WilsonImplD::ImplParams Params = setupParams(); WilsonImplD::ImplParams Params = setupParams();
readConfiguration<PeriodicGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field readConfiguration<PeriodicGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field

View File

@@ -0,0 +1,284 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
#if 0
template<typename Field>
class RationalHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
// OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
RealD _massDen, _massNum;
FunctionHermOp(LinearOperatorBase<Field>& linop, RealD massDen,RealD massNum)
: _Linop(linop) ,_massDen(massDen),_massNum(massNum) {};
void operator()(const Field& in, Field& out) {
// _poly(_Linop,in,out);
}
};
#endif
template<class Matrix,class Field>
class InvG5LinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _num;
RealD _Tol;
Integer _MaxIt;
Gamma g5;
public:
InvG5LinearOperator(Matrix &Mat,RealD num): _Mat(Mat),_num(num), _Tol(1e-12),_MaxIt(10000), g5(Gamma::Algebra::Gamma5) {};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
assert(0);
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
assert(0);
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
Field tmp(in.Grid());
MdagMLinearOperator<Matrix,Field> denom(_Mat);
ConjugateGradient<Field> CG(_Tol,_MaxIt);
_Mat.M(in,tmp);
tmp += _num*in;
_Mat.Mdag(tmp,out);
CG(denom,out,tmp);
out = g5*tmp;
}
};
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, resid,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
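// Editorial note: main() below reads the Lanczos setup from "LanParams.xml" via
// read(HMCrd,"LanczosParameters",LanParams). A minimal sketch of that file, assuming
// Grid's default <grid> document root and one child element per serialisable member
// (all values are illustrative only):
//
// <?xml version="1.0"?>
// <grid>
//   <LanczosParameters>
//     <mass>-0.5</mass>
//     <resid>1e-8</resid>
//     <Nstop>10</Nstop>
//     <Nk>20</Nk>
//     <Np>80</Np>
//     <ChebyLow>0.5</ChebyLow>
//     <ChebyHigh>60.0</ChebyHigh>
//     <ChebyOrder>31</ChebyOrder>
//   </LanczosParameters>
// </grid>
//
// Running the executable also writes "LanParams.xml.out", which shows the exact layout
// the reader expects.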
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5rb.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 5;
int Nk = 10;
int Np = 90;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
mass=LanParams.mass;
resid=LanParams.resid;
int Nm = Nk + Np;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
InvG5LinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator,-2.); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder); // polynomial filter damping the unwanted interval [ChebyLow,ChebyHigh]
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp); // filtered operator driving the Lanczos iteration
// InvHermOp<FermionField> Op(WilsonOperator,HermOp);
PlainHermOp<FermionField> Op (HermOp); // unfiltered operator used for the actual eigenvalues
// PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
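// Editorial note: the real part of <evec_i| g5 |evec_i> printed above is the standard
// chirality diagnostic for eigenmodes of a g5-Hermitian operator; would-be zero modes
// of the Wilson kernel give values close to +/-1, while paired non-chiral modes sit
// near zero. The imaginary part should vanish up to rounding.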
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}

View File

@@ -61,7 +61,8 @@ int main(int argc, char** argv) {
 RNG5.SeedFixedIntegers(seeds5);
 LatticeGaugeField Umu(UGrid);
-SU<Nc>::HotConfiguration(RNG4, Umu);
+// SU<Nc>::HotConfiguration(RNG4, Umu);
+SU<Nc>::ColdConfiguration(Umu);
 /*
 std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -69,9 +70,15 @@ int main(int argc, char** argv) {
 U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
 }
 */
+// std::vector<Complex> boundary = {1,1,1,-1};
+std::vector<Complex> boundary = {1,1,1,1};
+FermionOp::ImplParams Params(boundary);
-RealD mass = -0.1;
-FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
+RealD mass = 0.0;
+// FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
+FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
 MdagMLinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator); /// <-----
 //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
@@ -89,7 +96,8 @@ int main(int argc, char** argv) {
 FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
 PlainHermOp<FermionField> Op (HermOp);
-ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
+// ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
+SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
 std::vector<RealD> eval(Nm);
 FermionField src(FGrid);
@@ -101,7 +109,8 @@ int main(int argc, char** argv) {
 };
 int Nconv;
-IRL.calc(eval, evec, src, Nconv);
+// IRL.calc(eval, evec, src, Nconv);
+IRL.calc(eval, src, Nconv);
 std::cout << eval << std::endl;

View File

@@ -0,0 +1,249 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/parallelIO/IldgIOtypes.h>
using namespace std;
using namespace Grid;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
template <class T> void writeFile(T& in, std::string const fname){
#if 1
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
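// Editorial sketch: a matching reader for the fields written by writeFile above, using
// Grid's ScidacReader. This is illustrative only (it assumes the usual
// open/readScidacFieldRecord/close interface mirroring the writer) and is not called
// anywhere in this test.
template <class T> void readFile(T& out, std::string const fname){
std::cout << Grid::GridLogMessage << "Reads from: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacReader RD;
RD.open(fname);
RD.readScidacFieldRecord(out,record);
RD.close();
}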
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, mstep ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5rb.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
// SU<Nc>::ColdConfiguration(Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 10;
int Nk = 20;
int Np = 80;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
Nm = Nk + Np;
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<Complex> boundary = {1,1,1,-1};
// std::vector<Complex> boundary = {1,1,1,1};
FermionOp::ImplParams Params(boundary);
while ( mass > - 2.5){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
// Chebyshev<FermionField> Cheby(0.5, 60., 31);
// RealD, ChebyLow,
// RealD, ChebyHigh,
// Integer, ChebyOrder)
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
// SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
// IRL.calc(eval, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
// if ( i<1)
{
std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
auto evdensity = localInnerProduct(evec[i],evec[i] ); // site-wise mode density |evec_i(x)|^2, written out for visualisation
writeFile(evdensity,evfile);
}
}
src = evec[0]+evec[1]+evec[2];
src += evec[3]+evec[4]+evec[5];
src += evec[6]+evec[7]+evec[8];
mass += LanParams.mstep;
}
Grid_finalize();
}
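Editorial note (a sketch of the physics behind the mass scan above): the eigenvalues printed at each mass are those of the \gamma_5-Hermitian kernel

  H(m) = \gamma_5 D_W(m), \qquad H(m)\,\psi_i(m) = \lambda_i(m)\,\psi_i(m),

so stepping the mass by mstep traces out the spectral flow \lambda_i(m). In the usual interpretation, net level crossings through zero count the index (topological charge) of the background gauge field, which is what the per-mode chirality \langle\psi_i|\gamma_5|\psi_i\rangle and the mode-density files written above help to diagnose.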

View File

@@ -33,8 +33,7 @@ namespace Grid{
 GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters,
 int, steps,
 double, step_size,
-int, meas_interval,
-double, maxTau); // for the adaptive algorithm
+int, meas_interval);
 template <class ReaderClass >
@@ -86,7 +85,7 @@ int main(int argc, char **argv) {
 WFParameters WFPar(Reader);
 ConfParameters CPar(Reader);
 CheckpointerParameters CPPar(CPar.conf_prefix, CPar.rng_prefix);
-BinaryHmcCheckpointer<PeriodicGimplR> CPBin(CPPar);
+NerscHmcCheckpointer<PeriodicGimplR> CPBin(CPPar);
 for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
@@ -96,19 +95,13 @@ int main(int argc, char **argv) {
 std::cout << GridLogMessage << "Initial plaquette: "
 << WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
-int t=WFPar.maxTau;
-WilsonFlowAdaptive<PeriodicGimplR> WF(WFPar.step_size, WFPar.maxTau,
-1.0e-4,
+WilsonFlow<PeriodicGimplR> WF(WFPar.step_size, WFPar.steps,
 WFPar.meas_interval);
 WF.smear(Uflow, Umu);
 RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
-RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
-RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow);
 std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
-std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
-std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
 std::cout<< GridLogMessage << " Admissibility check:\n";
 const double sp_adm = 0.067; // admissible threshold

View File

@@ -1,4 +1,4 @@
-*************************************************************************************
+/*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid

View File

@@ -165,6 +165,7 @@ int main (int argc, char ** argv)
 }
 }
 if(gparity){
+#ifdef ENABLE_GPARITY
 std::cout << "Running test with G-parity BCs in " << gpdir << " direction" << std::endl;
 GparityWilsonImplParams params;
 params.twists[gpdir] = 1;
@@ -174,6 +175,9 @@ int main (int argc, char ** argv)
 ConjugateGimplD::setDirections(conj_dirs);
 run_test<GparityDomainWallFermionD, GparityDomainWallFermionF, ConjugateGaugeStatistics>(argc,argv,params);
+#else
+std::cout << " Gparity is not compiled "<<std::endl;
+#endif
 }else{
 std::cout << "Running test with periodic BCs" << std::endl;
 WilsonImplParams params;

View File

@@ -0,0 +1,37 @@
cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
project(GridViewer)
list(APPEND CMAKE_PREFIX_PATH "/Users/peterboyle/QCD/vtk/VTK-9.4.2-install/")
find_package(VTK COMPONENTS
CommonColor
CommonCore
FiltersCore
FiltersModeling
IOImage
IOFFMPEG
InteractionStyle
InteractionWidgets
RenderingContextOpenGL2
RenderingCore
RenderingFreeType
RenderingGL2PSOpenGL2
RenderingOpenGL2
)
if (NOT VTK_FOUND)
message(FATAL_ERROR "GridViewer: Unable to find the VTK build folder.")
endif()
# Prevent a "command line is too long" failure in Windows.
set(CMAKE_NINJA_FORCE_RESPONSE_FILE "ON" CACHE BOOL "Force Ninja to use response files.")
add_executable(FieldDensityAnimate MACOSX_BUNDLE FieldDensityAnimate.cxx )
target_link_libraries(FieldDensityAnimate PRIVATE ${VTK_LIBRARIES}
)
# vtk_module_autoinit is needed
vtk_module_autoinit(
TARGETS FieldDensityAnimate
MODULES ${VTK_LIBRARIES}
)
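# Editorial sketch of a typical out-of-source build; the VTK path listed in
# CMAKE_PREFIX_PATH above is machine specific and would normally be overridden:
#   cmake -S . -B build -DCMAKE_PREFIX_PATH=/path/to/VTK-install
#   cmake --build build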

Binary file not shown.

View File

@@ -0,0 +1,285 @@
#!/usr/bin/env python
# noinspection PyUnresolvedReferences
import math
import vtk
import vtkmodules.vtkInteractionStyle
# noinspection PyUnresolvedReferences
import vtkmodules.vtkRenderingOpenGL2
from vtkmodules.vtkCommonColor import vtkNamedColors
from vtkmodules.vtkCommonCore import (
VTK_VERSION_NUMBER,
vtkVersion
)
from vtkmodules.vtkCommonCore import VTK_DOUBLE
from vtkmodules.vtkCommonDataModel import vtkImageData
from vtkmodules.vtkFiltersCore import (
vtkMarchingCubes,
vtkStripper
)
from vtkmodules.vtkFiltersModeling import vtkOutlineFilter
from vtkmodules.vtkIOImage import (
vtkMetaImageReader,
vtkJPEGWriter,
vtkPNGWriter
)
from vtkmodules.vtkRenderingCore import (
vtkActor,
vtkCamera,
vtkPolyDataMapper,
vtkProperty,
vtkRenderWindow,
vtkRenderWindowInteractor,
vtkRenderer,
vtkWindowToImageFilter
)
class vtkTimerCallback():
def __init__(self, steps, imageData, iren):
self.timer_count = 0
self.steps = steps
self.imageData = imageData
self.iren = iren
self.timerId = None
self.step = 0
def execute(self, obj, event):
print(self.timer_count)
dims = self.imageData.GetDimensions()
t=self.step/10.0
z0 = 2
y0 = 4
x0 = 4
z1 = 14
y1 = 12
x1 = 12
for z in range(dims[2]):
for y in range(dims[1]):
for x in range(dims[0]):
self.imageData.SetScalarComponentFromDouble(x, y, z, 0,
math.sin(t)*math.exp(-0.25*((x-x0)*(x-x0)+(y-y0)*(y-y0)+(z-z0)*(z-z0)))
- math.cos(t)*math.exp(-0.25*((x-x1)*(x-x1)+(y-y1)*(y-y1)+(z-z1)*(z-z1))))
self.imageData.Modified()
iren = obj
iren.GetRenderWindow().Render()
self.timer_count += 1
self.step += 1
if self.step >= self.steps :
iren.DestroyTimer(self.timerId)
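# Editorial note: each timer tick the callback above overwrites the whole volume with
# sin(t)- and cos(t)-weighted Gaussians at two fixed centres, so the two isosurfaces
# extracted in main() pulse in antiphase; the timer is destroyed once `steps` ticks
# have elapsed.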
def WriteImage(fileName, renWin):
'''
Write the current render window contents to an image file (PNG by default,
JPEG if the file name ends in .jpg).
'''
import os
if fileName:
# Select the writer to use.
path, ext = os.path.splitext(fileName)
ext = ext.lower()
if not ext:
ext = '.png'
fileName = fileName + ext
if ext == '.jpg':
writer = vtkJPEGWriter()
else:
writer = vtkPNGWriter()
windowto_image_filter = vtkWindowToImageFilter()
windowto_image_filter.SetInput(renWin)
windowto_image_filter.SetScale(1) # image quality
windowto_image_filter.SetInputBufferTypeToRGBA()
writer.SetFileName(fileName)
writer.SetInputConnection(windowto_image_filter.GetOutputPort())
writer.Write()
else:
raise RuntimeError('Need a filename.')
def main():
colors = vtkNamedColors()
file_name = get_program_parameters()
colors.SetColor('InstantonColor', [240, 184, 160, 255])
colors.SetColor('BackfaceColor', [255, 229, 200, 255])
colors.SetColor('BkgColor', [51, 77, 102, 255])
# Create the renderer, the render window, and the interactor. The renderer
# draws into the render window, the interactor enables mouse- and
# keyboard-based interaction with the data within the render window.
#
a_renderer = vtkRenderer()
ren_win = vtkRenderWindow()
ren_win.AddRenderer(a_renderer)
iren = vtkRenderWindowInteractor()
iren.SetRenderWindow(ren_win)
# Rather than reading a series of 2D image slices from disk (as in the VTK
# Medical2 example this script is derived from), a small 16^3 vtkImageData
# volume is created directly in memory and filled below.
imageData = vtkImageData()
imageData.SetDimensions(16, 16, 16)
imageData.AllocateScalars(VTK_DOUBLE, 1)
dims = imageData.GetDimensions()
# Fill the volume with two Gaussian lumps of opposite sign centred at
# different points of the box (an instanton / anti-instanton toy density).
for z in range(dims[2]):
z0 = dims[2]/2
for y in range(dims[1]):
y0 = dims[1]/2
for x in range(dims[0]):
x0 = dims[0]/2
imageData.SetScalarComponentFromDouble(x, y, z, 0, math.exp(-0.25*((x-x0)*(x-x0)+(y-y0)*(y-y0)+z*z)) - math.exp(-0.25*((x-x0)*(x-x0)+y*y+(z-z0)*(z-z0))))
instanton_extractor = vtkMarchingCubes()
instanton_extractor.SetInputData(imageData)
instanton_extractor.SetValue(0, 0.1)
instanton_stripper = vtkStripper()
instanton_stripper.SetInputConnection(instanton_extractor.GetOutputPort())
instanton_mapper = vtkPolyDataMapper()
instanton_mapper.SetInputConnection(instanton_stripper.GetOutputPort())
instanton_mapper.ScalarVisibilityOff()
instanton = vtkActor()
instanton.SetMapper(instanton_mapper)
instanton.GetProperty().SetDiffuseColor(colors.GetColor3d('InstantonColor'))
instanton.GetProperty().SetSpecular(0.3)
instanton.GetProperty().SetSpecularPower(20)
instanton.GetProperty().SetOpacity(0.5)
# The triangle stripper is used to create triangle strips from the
# isosurface; these render much faster on many systems.
antiinstanton_extractor = vtkMarchingCubes()
antiinstanton_extractor.SetInputData(imageData)
antiinstanton_extractor.SetValue(0, -0.1)
antiinstanton_stripper = vtkStripper()
antiinstanton_stripper.SetInputConnection(antiinstanton_extractor.GetOutputPort())
antiinstanton_mapper = vtkPolyDataMapper()
antiinstanton_mapper.SetInputConnection(antiinstanton_stripper.GetOutputPort())
antiinstanton_mapper.ScalarVisibilityOff()
antiinstanton = vtkActor()
antiinstanton.SetMapper(antiinstanton_mapper)
antiinstanton.GetProperty().SetDiffuseColor(colors.GetColor3d('Ivory'))
# An outline provides a box around the data.
outline_data = vtkOutlineFilter()
outline_data.SetInputData(imageData)
map_outline = vtkPolyDataMapper()
map_outline.SetInputConnection(outline_data.GetOutputPort())
outline = vtkActor()
outline.SetMapper(map_outline)
outline.GetProperty().SetColor(colors.GetColor3d('Black'))
# It is convenient to create an initial view of the data. The FocalPoint
# and Position form a vector direction. Later on (ResetCamera() method)
# this vector is used to position the camera to look at the data in
# this direction.
a_camera = vtkCamera()
a_camera.SetViewUp(0, 0, -1)
a_camera.SetPosition(0, -100, 0)
a_camera.SetFocalPoint(0, 0, 0)
a_camera.ComputeViewPlaneNormal()
a_camera.Azimuth(30.0)
a_camera.Elevation(30.0)
# Actors are added to the renderer. An initial camera view is created.
# The Dolly() method moves the camera towards the FocalPoint,
# thereby enlarging the image.
a_renderer.AddActor(outline)
a_renderer.AddActor(instanton)
a_renderer.AddActor(antiinstanton)
a_renderer.SetActiveCamera(a_camera)
a_renderer.ResetCamera()
a_camera.Dolly(1.0)
# Set a background color for the renderer and set the size of the
# render window (expressed in pixels).
a_renderer.SetBackground(colors.GetColor3d('BkgColor'))
ren_win.SetSize(1024, 1024)
ren_win.SetWindowName('ExpoDemo')
# Note that when camera movement occurs (as it does in the Dolly()
# method), the clipping planes often need adjusting. Clipping planes
# consist of two planes: near and far along the view direction. The
# near plane clips out objects in front of the plane; the far plane
# clips out objects behind the plane. This way only what is drawn
# between the planes is actually rendered.
a_renderer.ResetCameraClippingRange()
# write image
# WriteImage('exp.jpg',ren_win)
# Sign up to receive TimerEvent
cb = vtkTimerCallback(200, imageData, iren)
iren.AddObserver('TimerEvent', cb.execute)
cb.timerId = iren.CreateRepeatingTimer(50)
# start the interaction and timer
ren_win.Render()
# Initialize the event loop and then start it.
iren.Initialize()
iren.Start()
def get_program_parameters():
import argparse
description = 'Simple lattice volumetric demo'
epilogue = '''
Derived from VTK/Examples/Cxx/Medical2.cxx
'''
parser = argparse.ArgumentParser(description=description, epilog=epilogue,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('filename', help='FieldDensity.py')
args = parser.parse_args()
return args.filename
def vtk_version_ok(major, minor, build):
"""
Check the VTK version.
:param major: Major version.
:param minor: Minor version.
:param build: Build version.
:return: True if the actual VTK version is greater than or equal to the requested version.
"""
needed_version = 10000000000 * int(major) + 100000000 * int(minor) + int(build)
try:
vtk_version_number = VTK_VERSION_NUMBER
except AttributeError: # as error:
ver = vtkVersion()
vtk_version_number = 10000000000 * ver.GetVTKMajorVersion() + 100000000 * ver.GetVTKMinorVersion() \
+ ver.GetVTKBuildVersion()
if vtk_version_number >= needed_version:
return True
else:
return False
if __name__ == '__main__':
main()
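# Editorial note: get_program_parameters() requires a positional filename, but main()
# currently only stores it in file_name; the rendered density is generated procedurally
# by the code above. An assumed invocation (script name illustrative) would be:
#   python FieldDensityAnimate.py placeholder.dat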
