Merge 32e6d58356 into ba9bbe0221

use accelerator for setCheckerboard in RHMC
2025-06-13 20:57:06 +01:00 · 2025-02-13 11:26:59 +00:00 · 2021-12-22 23:43:43 +00:00
43 changed files with 178 additions and 1632 deletions
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@ -191,7 +191,7 @@ public:
      
    Lattice<sobj> pgbuf(&pencil_g);
    autoView(pgbuf_v , pgbuf, CpuWrite);
-    //std::cout << "CPU view" << std::endl;
+    std::cout << "CPU view" << std::endl;
    
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
@ -215,7 +215,7 @@ public:
    else if ( sign == forward ) div = 1.0;
    else assert(0);
      
-    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
    FFTW_plan p;
    {
      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +229,7 @@ public:
    }
      
    // Barrel shift and collect global pencil
-    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+    std::cout << GridLogPerformance<<"Making pencil" << std::endl;
    Coordinate lcoor(Nd), gcoor(Nd);
    result = source;
    int pc = processor_coor[dim];
@ -251,7 +251,7 @@ public:
      }
    }
      
-    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
+    std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
    // Loop over orthog coords
    int NN=pencil_g.lSites();
    GridStopWatch timer;
@ -274,7 +274,7 @@ public:
    usec += timer.useconds();
    flops+= flops_call*NN;
      
-    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
+    std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
    // writing out result
    {
      autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +291,7 @@ public:
    }
    result = result*div;
      
-    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
+    std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
    // destroying plan
    FFTW<scalar>::fftw_destroy_plan(p);
 #endif
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@ -277,38 +277,6 @@ public:
    assert(0);
  }
 };
-template<class Matrix,class Field>
-class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
-  Matrix &_Mat;
-  RealD shift;
-public:
-  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
-  // Support for coarsening to a multigrid
-  void OpDiag (const Field &in, Field &out) {
-    _Mat.Mdiag(in,out);
-    out = out + shift*in;
-  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    _Mat.Mdir(in,out,dir,disp);
-  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){
-    _Mat.MdirAll(in,out);
-  };
-  void Op     (const Field &in, Field &out){
-    _Mat.M(in,out);
-    out = out + shift * in;
-  }
-  void AdjOp     (const Field &in, Field &out){
-    _Mat.Mdag(in,out);
-    out = out + shift * in;
-  }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    assert(0);
-  }
-  void HermOp(const Field &in, Field &out){
-    assert(0);
-  }
-};

 //////////////////////////////////////////////////////////
 // Even Odd Schur decomp operators; there are several
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@ -208,8 +208,8 @@ public:
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);

-    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    //assert(OpB!=GridBLAS_OP_T);
+    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    assert(OpB!=GridBLAS_OP_T);

    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@ -367,67 +367,28 @@ public:
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
-	  else
-	    eCmn = alpha * eAmk * eBkn ;
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
        });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
-	  else
-	    eCmn = alpha * eAmk.adjoint() * eBkn ;
-	  });
-      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
-	thread_for (p, batchCount, {
-	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
-	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
-	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
-	  else
-	    eCmn = alpha * eAmk.transpose() * eBkn ;
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
-	  else
-	    eCmn = alpha * eAmk * eBkn.adjoint() ;
-	  });
-      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
-	thread_for (p, batchCount, {
-	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
-	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
-	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
-	  else
-	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
-	  } );
-      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
-	thread_for (p, batchCount, {
-	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
-	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
-	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-	  else
-	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
      } else { 
 	assert(0);
@ -453,8 +414,8 @@ public:
    RealD t2=usecond();
    int32_t batchCount = Amk.size();

-    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    //assert(OpB!=GridBLAS_OP_T);
+    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    assert(OpB!=GridBLAS_OP_T);

    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@ -553,70 +514,28 @@ public:
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
-	  else
-	    eCmn = alpha * eAmk * eBkn ;
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
-	  else
-	    eCmn = alpha * eAmk.adjoint() * eBkn ;
-	  });
-      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
-	thread_for (p, batchCount, {
-	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
-	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
-	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
-	  else
-	    eCmn = alpha * eAmk.transpose() * eBkn ;
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
-	  else
-	    eCmn = alpha * eAmk * eBkn.adjoint() ;
-	  });
-      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
-	thread_for (p, batchCount, {
-	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
-	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
-	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
-	  else
-	    eCmn = alpha * eAmk * eBkn.transpose() ;
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
-	  else
-	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
-	  } );
-      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
-	thread_for (p, batchCount, {
-	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
-	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
-	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-	  else
-	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
      } else { 
 	assert(0);
@ -742,41 +661,29 @@ public:
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
-	  else
-	    eCmn = alpha * eAmk * eBkn ;
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
-	  else
-	    eCmn = alpha * eAmk.transpose() * eBkn ;
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
-	  else
-	    eCmn = alpha * eAmk * eBkn.transpose() ;	  
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-	  else
-	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
-	  });
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  } );
      } else { 
 	assert(0);
      }
@ -902,40 +809,28 @@ public:
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
-	  else
-	    eCmn = alpha * eAmk * eBkn ;
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
-	  else
-	    eCmn = alpha * eAmk.transpose() * eBkn ;
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
-	  else
-	    eCmn = alpha * eAmk * eBkn.transpose() ;
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  if (std::abs(beta) != 0.0)
-	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-	  else
-	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
      } else { 
 	assert(0);
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -245,10 +245,9 @@ until convergence
 	_HermOp(src_n,tmp);
 	//	std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
 	//	std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
-//	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
-	RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
+	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
 	RealD vden = norm2(src_n);
-	RealD na = std::sqrt(vnum/vden);
+	RealD na = vnum/vden;
 	if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
@ -256,7 +255,6 @@ until convergence
 	src_n = tmp;
      }
    }
-    std::cout << GridLogIRL << " Final evalMaxApprox  " << evalMaxApprox << std::endl;
 	
    std::vector<RealD> lme(Nm);  
    std::vector<RealD> lme2(Nm);
--- a/Grid/algorithms/multigrid/Aggregates.h
+++ b/Grid/algorithms/multigrid/Aggregates.h
@ -97,7 +97,7 @@ public:

    RealD scale;

-    ConjugateGradient<FineField> CG(1.0e-3,400,false);
+    ConjugateGradient<FineField> CG(1.0e-2,100,false);
    FineField noise(FineGrid);
    FineField Mn(FineGrid);

@ -110,7 +110,7 @@ public:
      
      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;

-      for(int i=0;i<4;i++){
+      for(int i=0;i<1;i++){

 	CG(hermop,noise,subspace[b]);

@ -146,7 +146,7 @@ public:
      
      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;

-      for(int i=0;i<2;i++){
+      for(int i=0;i<3;i++){
 	//  void operator() (const Field &src, Field &psi){
 #if 1
 	std::cout << GridLogMessage << " inverting on noise "<<std::endl;
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@ -441,20 +441,8 @@ public:
    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
  }
 #else
-  //////////////////////////////////////////////////////////////////////
-  // Galerkin projection of matrix
-  //////////////////////////////////////////////////////////////////////
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
-  {
-    CoarsenOperator(linop,Subspace,Subspace);
-  }
-  //////////////////////////////////////////////////////////////////////
-  // Petrov - Galerkin projection of matrix
-  //////////////////////////////////////////////////////////////////////
-  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & U,
-		       Aggregation<Fobj,CComplex,nbasis> & V)
  {
    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();
@ -470,9 +458,11 @@ public:
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid()); 
-    blockOrthogonalise(InnerProd,V.subspace);
-    blockOrthogonalise(InnerProd,U.subspace);
+    blockOrthogonalise(InnerProd,Subspace.subspace);

+    //    for(int s=0;s<Subspace.subspace.size();s++){
+      //      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
+    //    }
    const int npoint = geom.npoint;
      
    Coordinate clatt = CoarseGrid()->GlobalDimensions();
@ -552,7 +542,7 @@ public:
      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 	tphaseBZ-=usecond();
-	phaV = phaF[p]*V.subspace[i];
+	phaV = phaF[p]*Subspace.subspace[i];
 	tphaseBZ+=usecond();

 	/////////////////////////////////////////////////////////////////////
@ -565,7 +555,7 @@ public:
 	//	std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;

 	tproj-=usecond();
-	blockProject(coarseInner,MphaV,U.subspace);
+	blockProject(coarseInner,MphaV,Subspace.subspace);
 	coarseInner = conjugate(pha[p]) * coarseInner;

 	ComputeProj[p] = coarseInner;
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@ -69,7 +69,7 @@ public:
  }

  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p, const _Tp& __val) { assert(0);};
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -234,9 +234,6 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
-  if(bytes>=DeviceMaxBytes) {
-    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
-  }
  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@ -149,8 +149,7 @@ public:
 			    sizeof(obj),d*100+p);

      }
-      if (!list.empty()) // avoid triggering assert in comms == none
-	CommsComplete(list);
+      CommsComplete(list);
      for(int p=1;p<_processors[d];p++){
 	accum = accum + column[p];
      }
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -438,15 +438,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
-#ifdef NVLINK_GET
-    else { 
-      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
-    }
-#endif
  }
-  // This is a NVLINK PUT  
+  
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
@ -455,11 +448,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(xrq);
      off_node_bytes+=xbytes;
    } else {
-#ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-#endif
    }
  }
  return off_node_bytes;
@ -468,7 +459,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  int nreq=list.size();
-  /*finishes Get/Put*/
+
  acceleratorCopySynchronise();

  if (nreq==0) return;
@ -755,31 +746,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
+  //  int nreq=list.size();

-  std::vector<MPI_Status> status;
-  std::vector<MPI_Request> MpiRequests;
-    
-  for(int r=0;r<list.size();r++){
-    // Must check each Send buf is clear to reuse
-    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
-    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
-  }
+  //  if (nreq==0) return;
+  //  std::vector<MPI_Status> status(nreq);
+  //  std::vector<MPI_Request> MpiRequests(nreq);

-  int nreq=MpiRequests.size();
-
-  if (nreq>0) {
-    status.resize(MpiRequests.size());
-    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
-    assert(ierr==0);
-  }
+  //  for(int r=0;r<nreq;r++){
+  //    MpiRequests[r] = list[r].req;
+  //  }
  
+  //  int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+  //  assert(ierr==0);
+
  //  for(int r=0;r<nreq;r++){
  //    if ( list[r].PacketType==InterNodeRecv ) {
  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
  //    }
  //  }
  
+  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
  
  list.resize(0);               // Delete the list
  this->HostBufferFreeAll();    // Clean up the buffer allocs
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@ -137,7 +137,7 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
-  //  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
+  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  static void SharedMemoryZero(void *dest,size_t bytes);

 };
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -542,12 +542,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  // printf("Host buffer allocate for GPU non-aware MPI\n");
+  printf("Host buffer allocate for GPU non-aware MPI\n");
 #if 0
  HostCommBuf= acceleratorAllocHost(bytes);
 #else 
  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#if 0
+#ifdef HAVE_NUMAIF_H
  #warning "Moving host buffers to specific NUMA domain"
  int numa;
  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
  bzero(dest,bytes);
 #endif
 }
-//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-//{
-//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-//  acceleratorCopyToDevice(src,dest,bytes);
-//#else   
-//  bcopy(src,dest,bytes);
-//#endif
-//}
+void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+{
+#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+  acceleratorCopyToDevice(src,dest,bytes);
+#else   
+  bcopy(src,dest,bytes);
+#endif
+}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@ -959,7 +959,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);

    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
-    //    std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
  }
  ShmBufferFreeAll();

@ -990,7 +989,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
 #endif

-  SharedMemoryTest();
+  //SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
@ -1012,18 +1011,19 @@ void SharedMemory::SharedMemoryTest(void)
       check[0]=GlobalSharedMemory::WorldNode;
       check[1]=r;
       check[2]=magic;
-       acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
+       GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
    }
  }
  ShmBarrier();
  for(uint64_t r=0;r<ShmSize;r++){
-    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
+    ShmBarrier();
+    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
+    ShmBarrier();
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==magic);
+    ShmBarrier();
  }
-  ShmBarrier();
-  std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
 }

 void *SharedMemory::ShmBuffer(int rank)
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@ -126,8 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
-  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
-  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+  static hostVector<vobj> hsend_buf;  hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf;  hrecv_buf.resize(buffer_size);
 #endif
  
  int cb= (cbmask==0x2)? Odd : Even;
@ -244,6 +244,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;

+
  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
  
  //copy offsets to device
-  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  
  
  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
    exit(EXIT_FAILURE);
  }
  
-  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  
  //sync after copy
  accelerator_barrier();
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -485,6 +485,7 @@ public:
    assert(this->u_comm_offset==this->_unified_buffer_size);
    accelerator_barrier();
 #ifdef NVLINK_GET
+    #warning "NVLINK_GET"
    this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
    // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
    // Or issue barrier AFTER the DMA is running
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else {							\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  acceleratorSynchronise();					\
+  acceleratorSynchronise();						\
  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
  Recon(result, Uchi);

@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v , st,AcceleratorRead);

   if( interior && exterior ) {
-     acceleratorFenceComputeStream();
+     //     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
--- a/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
+++ b/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
@ -86,8 +86,13 @@ public:
    assert(ForceE.Checkerboard()==Even);
    assert(ForceO.Checkerboard()==Odd);

+#if defined(GRID_CUDA) || defined(GRID_HIP)  || defined(GRID_SYCL)
+    acceleratorSetCheckerboard(Force,ForceE);
+    acceleratorSetCheckerboard(Force,ForceO);
+#else
    setCheckerboard(Force,ForceE); 
    setCheckerboard(Force,ForceO);
+#endif
    Force=-Force;

    delete forcecb;
@ -130,8 +135,13 @@ public:
    assert(ForceE.Checkerboard()==Even);
    assert(ForceO.Checkerboard()==Odd);

+#if defined(GRID_CUDA) || defined(GRID_HIP)  || defined(GRID_SYCL)
+    acceleratorSetCheckerboard(Force,ForceE);
+    acceleratorSetCheckerboard(Force,ForceO);
+#else
    setCheckerboard(Force,ForceE); 
    setCheckerboard(Force,ForceO);
+#endif
    Force=-Force;

    delete forcecb;
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -446,7 +446,6 @@ public:
    Communicate();
    CommsMergeSHM(compress);
    CommsMerge(compress);
-    accelerator_barrier();
  }

  template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@ -519,6 +518,7 @@ public:
    }
    accelerator_barrier(); // All my local gathers are complete
 #ifdef NVLINK_GET
+    #warning "NVLINK_GET"
    _grid->StencilBarrier(); // He can now get mu local gather, I can get his
    // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
    // Or issue barrier AFTER the DMA is running
@ -690,7 +690,6 @@ public:
 	}
      }
    }
-    //    std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
    surface_list.resize(surface_list_size);
    std::vector<int> surface_list_host(surface_list_size);
    int32_t ss=0;
@ -710,7 +709,7 @@ public:
      }
    }
    acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
-    //    std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
+    std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@ -802,8 +801,8 @@ public:
    this->_entries_host_p = &_entries[0];
    this->_entries_p = &_entries_device[0];

-    //    std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
-    //	      <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
+    std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
+	      <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
    
    for(int ii=0;ii<npoints;ii++){

--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -242,33 +242,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };

-typedef int acceleratorEvent_t;
-
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
-  acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
-  return 0;
-}
-inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
-  acceleratorCopyFromDevice(from,to,bytes);
-  return 0;
-}
-inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
-  return 0;
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
-inline void acceleratorEventWait(acceleratorEvent_t ev)
-{
-  //auto discard=cudaStreamSynchronize(ev);
-}
-inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}


 inline int  acceleratorIsCommunicable(void *ptr)
@ -377,8 +363,8 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t
 inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
 inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }

-inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

 inline int  acceleratorIsCommunicable(void *ptr)
@ -492,7 +478,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 inline void *acceleratorAllocHost(size_t bytes)
 {
  void *ptr=NULL;
-  auto err = hipHostMalloc((void **)&ptr,bytes);
+  auto err = hipMallocHost((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@ -525,35 +511,23 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}

 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

-typedef int acceleratorEvent_t;
-
-inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
-  return 0;
 }
-inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  acceleratorCopyToDevice(from,to,bytes);
-  return 0;
+inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
 }
-inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  acceleratorCopyFromDevice(from,to,bytes);
-  return 0;
+inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };

-inline void acceleratorEventWait(acceleratorEvent_t ev)
-{
-  //  auto discard=hipStreamSynchronize(ev);
-}
-inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
-
-
 #endif

 inline void acceleratorPin(void *ptr,unsigned long bytes)
@ -590,8 +564,6 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)

 #undef GRID_SIMT

-typedef int acceleratorEvent_t;
-
 inline void acceleratorMem(void)
 {
  /*
@ -611,12 +583,9 @@ inline void acceleratorMem(void)

 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

-inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { acceleratorCopyToDevice(from,to,bytes); return 0; }
-inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { acceleratorCopyFromDevice(from,to,bytes); return 0; }
-inline void acceleratorEventWait(acceleratorEvent_t ev){}
-inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
-inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); return 0;}
-
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};

 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
@ -705,9 +674,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
  acceleratorCopySynchronise();
 }

-template<class T> void acceleratorPut(T& dev,const T&host)
+template<class T> void acceleratorPut(T& dev,T&host)
 {
-  acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
+  acceleratorCopyToDevice(&host,&dev,sizeof(T));
 }
 template<class T> T acceleratorGet(T& dev)
 {
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_critical                                     DO_PRAGMA(omp critical)

 #ifdef GRID_OMP
-inline void thread_bcopy(const void *from, void *to,size_t bytes)
+inline void thread_bcopy(void *from, void *to,size_t bytes)
 {
-  const uint64_t *ufrom = (const uint64_t *)from;
+  uint64_t *ufrom = (uint64_t *)from;
  uint64_t *uto   = (uint64_t *)to;
  assert(bytes%8==0);
  uint64_t words=bytes/8;
@ -84,7 +84,7 @@ inline void thread_bcopy(const void *from, void *to,size_t bytes)
  });
 }
 #else
-inline void thread_bcopy(const void *from, void *to,size_t bytes)
+inline void thread_bcopy(void *from, void *to,size_t bytes)
 {
  bcopy(from,to,bytes);
 }
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -509,14 +509,7 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);

-  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
-    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
-    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
-    FlightRecorder::PrintEntireLog = 1;
-    FlightRecorder::ChecksumComms  = 1;
-    FlightRecorder::ChecksumCommsSend=1;
-  }
-  
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@ -658,4 +651,3 @@ void Grid_debug_handler_init(void)
 }

 NAMESPACE_END(Grid);
-
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@ -50,7 +50,7 @@ namespace Grid{
      int64_t index64;
      IndexFromCoorReversed(coor,index64,dims);
      if ( index64>=2*1024*1024*1024LL ){
-	//	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
      }
      assert(index64<2*1024*1024*1024LL);
      index = (int) index64;
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@ -492,18 +492,17 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t no    = 50;
-	uint64_t ni    = 100;
+	uint64_t ncall = 500;
+
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

 	time_statistics timestat;
-	std::vector<double> t_time(no);
-	for(uint64_t i=0;i<no;i++){
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
-	  for(uint64_t j=0;j<ni;j++){
-	    Dw.DhopEO(src_o,r_e,DaggerNo);
-	  }
+	  Dw.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@ -521,11 +520,11 @@ public:
 	double mf_hi, mf_lo, mf_err;

 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min*ni;
-	mf_lo = flops/timestat.max*ni;
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;

-	mflops = flops/timestat.mean*ni;
+	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@ -536,7 +535,6 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;

      }

@ -656,19 +654,17 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
+	uint64_t ncall = 500;

-	uint64_t no    = 50;
-	uint64_t ni    = 100;
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

 	time_statistics timestat;
-	std::vector<double> t_time(no);
-	for(uint64_t i=0;i<no;i++){
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
-	  for(uint64_t j=0;j<ni;j++){
-	    Ds.DhopEO(src_o,r_e,DaggerNo);
-	  }
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@ -679,11 +675,11 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min*ni;
-	mf_lo = flops/timestat.max*ni;
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;

-	mflops = flops/timestat.mean*ni;
+	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@ -693,7 +689,6 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
      
      }

@ -797,18 +792,19 @@ public:
 	  Dc.M(src,r);
 	}
 	FGrid->Barrier();
-	uint64_t ni = 100;
-	uint64_t no = 50;
+	double t1=usecond();
+	uint64_t ncall = 500;
+
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+
 	time_statistics timestat;
-	std::vector<double> t_time(no);
-	for(uint64_t i=0;i<no;i++){
-	  double t0=usecond();
-	  for(uint64_t j=0;j<ni;j++){
-	    Dc.M(src,r);
-	  }
-	  double t1=usecond();
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Dc.M(src,r);
+	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
@ -818,21 +814,20 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min*ni;
-	mf_lo = flops/timestat.max*ni;
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;

-	mflops = flops/timestat.mean*ni;
+	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call   "<< timestat.mean/ni<<std::endl;
      
      }

@ -877,7 +872,7 @@ int main (int argc, char ** argv)
  int do_dslash=1;

  int sel=4;
-  std::vector<int> L_list({8,12,16,24,32});
+  std::vector<int> L_list({8,12,16,24});
  int selm1=sel-1;

  std::vector<double> clover;
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@ -1,19 +1,18 @@
 #Ahead of time compile for PVC

-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"

 #JIT compile 
 #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "

-../configure \
+../../configure \
 	--enable-simd=GPU \
 	--enable-reduction=grid \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-debug \
-	--prefix $HOME/gpt-install \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--with-lime=$CLIME \
--- a/systems/Frontier-rocm631/config-command
+++ b/systems/Frontier-rocm631/config-command
@ -1,22 +0,0 @@
-CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
-../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
-CXX=hipcc MPICXX=mpicxx \
-CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
- LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
-
-
-
-
--- a/systems/Frontier-rocm631/sourceme631.sh
+++ b/systems/Frontier-rocm631/sourceme631.sh
@ -1,16 +0,0 @@
-
-echo spack
-. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
-
-#module load cce/15.0.1
-
-module load rocm/6.3.1
-module load cray-fftw
-module load craype-accel-amd-gfx90a
-export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
-
-#Ugly hacks to get down level software working on current system
-#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
-#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
-#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
-
--- a/systems/Frontier/benchmarks/bench2.slurm
+++ b/systems/Frontier/benchmarks/bench2.slurm
@ -30,10 +30,14 @@ source ${root}/sourceme.sh

 export OMP_NUM_THREADS=7
 export MPICH_GPU_SUPPORT_ENABLED=1
-#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
-#64.64.32.96
-for vol in 64.64.32.64
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+
+for vol in 32.32.32.64
 do
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol
+
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
 done

--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@ -3,19 +3,20 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --with-lime=$CLIME \
 --enable-unified=no \
 --enable-shm=nvlink \
--enable-tracing=none \
+--enable-tracing=timer \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
+--enable-accelerator-cshift \
 --with-gmp=$OLCF_GMP_ROOT \
 --with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
-CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
- LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
+CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
+ LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"



--- a/systems/Frontier/sourceme.sh
+++ b/systems/Frontier/sourceme.sh
@ -1,25 +1,12 @@
-
-echo spack
 . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
-
-module load cce/15.0.1
-module load rocm/5.3.0
+spack load c-lime
+module load emacs 
+module load PrgEnv-gnu
+module load rocm/6.0.0
+module load cray-mpich
+module load gmp
 module load cray-fftw
 module load craype-accel-amd-gfx90a
-
-#Ugly hacks to get down level software working on current system
-export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
-ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
-
-#echo spack load c-lime
-#spack load c-lime
-#module load emacs 
-##module load PrgEnv-gnu
-##module load cray-mpich
-##module load cray-fftw
-##module load craype-accel-amd-gfx90a
-##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
 #Hack for lib
-##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@ -1,206 +0,0 @@
-The purpose of this file is to collate all non-obvious known magic shell variables
-and compiler flags required for either correctness or performance on various systems.
-
-A repository of work-arounds.
-
-Contents:
-1. Interconnect + MPI
-2. Compilation
-3. Profiling
-
-************************
-* 1. INTERCONNECT + MPI
-************************
-
--------------------------------------------------------------------
-MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O 
--------------------------------------------------------------------
-export OMPI_MCA_io=romio321
-
--------------------------------------
-ROMIO fail with > 2GB per node read (32 bit issue)
--------------------------------------
-
-Use later MPICH
-
-https://github.com/paboyle/Grid/issues/381
-
-https://github.com/pmodels/mpich/commit/3a479ab0
-
--------------------------------------------------------------------
-Slingshot: Frontier and Perlmutter libfabric slow down 
-and physical memory fragmentation 
--------------------------------------------------------------------
-export FI_MR_CACHE_MONITOR=disabled
-or
-export FI_MR_CACHE_MONITOR=kdreg2
-
--------------------------------------------------------------------
-Perlmutter
--------------------------------------------------------------------
-
-export MPICH_RDMA_ENABLED_CUDA=1
-export MPICH_GPU_IPC_ENABLED=1
-export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
-export MPICH_GPU_NO_ASYNC_MEMCPY=0
-
--------------------------------------------------------------------
-Frontier/LumiG
--------------------------------------------------------------------
-
-Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
-
-cat << EOF > select_gpu
-#!/bin/bash
-export MPICH_GPU_SUPPORT_ENABLED=1
-export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
-export GPU_MAP=(0 1 2 3 7 6 5 4)
-export NUMA_MAP=(3 3 1 1 2 2 0 0)
-export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
-export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
-export HIP_VISIBLE_DEVICES=\$GPU
-unset ROCR_VISIBLE_DEVICES
-echo RANK \$SLURM_LOCALID using GPU \$GPU    
-exec numactl -m \$NUMA -N \$NUMA \$*
-EOF
-chmod +x ./select_gpu
-
-srun ./select_gpu BINARY
-
-
--------------------------------------------------------------------
-Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
-export OMPI_MCA_btl=^uct,openib
-export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
-export UCX_RNDV_SCHEME=put_zcopy
-export UCX_RNDV_THRESH=16384
-export UCX_IB_GPU_DIRECT_RDMA=yes
-
--------------------------------------------------------------------
-Mellanox + A100 correctness (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
-export UCX_MEMTYPE_CACHE=n
-
--------------------------------------------------------------------
-MPICH/Aurora/PVC correctness and performance 
--------------------------------------------------------------------
-
-https://github.com/pmodels/mpich/issues/7302
-
--enable-cuda-aware-mpi=no  
--enable-unified=no
-
-Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
-Do not use SVM
-
-Ideally use MPICH with fix to issue 7302:
-
-https://github.com/pmodels/mpich/pull/7312
-
-Ideally:
-MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
-
-Alternatives:
-export MPIR_CVAR_NOLOCAL=1
-export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
-
--------------------------------------------------------------------
-MPICH/Aurora/PVC correctness and performance 
--------------------------------------------------------------------
-
-Broken:
-export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
-
-This gives good peformance without requiring 
--enable-cuda-aware-mpi=no  
-
-But is an open issue reported by James Osborn
-https://github.com/pmodels/mpich/issues/7139
-
-Possibly resolved but unclear if in the installed software yet.
-
-************************
-* 2. COMPILATION
-************************
-
--------------------------------------------------------------------
-G++ compiler breakage / graveyard
--------------------------------------------------------------------
-
-9.3.0, 10.3.1, 
-https://github.com/paboyle/Grid/issues/290
-https://github.com/paboyle/Grid/issues/264
-
-Working (-) Broken (X):
-
-4.9.0 -
-4.9.1 -
-5.1.0 X
-5.2.0 X
-5.3.0 X
-5.4.0 X
-6.1.0 X
-6.2.0 X
-6.3.0 -
-7.1.0 -
-8.0.0 (HEAD) -
-
-https://github.com/paboyle/Grid/issues/100
-
--------------------------------------------------------------------
-AMD GPU nodes :
--------------------------------------------------------------------
-
-multiple ROCM versions broken; use 5.3.0
-manifests itself as wrong results in fp32 
-
-https://github.com/paboyle/Grid/issues/464
-
--------------------------------------------------------------------
-Aurora/PVC
--------------------------------------------------------------------
-
-SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
-SYCL slow link and relocatable code issues (Christoph Lehner)
-Opt large register file required for good performance in fp64
-
-
-export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fPIC"
-
--------------------------------------------------------------------
-Aurora/PVC useful extra options
--------------------------------------------------------------------
-
-Host only sanitizer:
-Xarch_host -fsanitize=leak
-Xarch_host -fsanitize=address
-
-Deterministic MPI reduction:
-export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
-export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
-export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
-unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
-unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
-unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
-
-
-
-************************
-* 3. Visual profile tools
-************************
-
--------------------------------------------------------------------
-Frontier/rocprof
--------------------------------------------------------------------
-
--------------------------------------------------------------------
-Aurora/unitrace
--------------------------------------------------------------------
-
-
--------------------------------------------------------------------
-Tursa/nsight-sys
--------------------------------------------------------------------
--- a/systems/sdcc-genoa/bench.slurm
+++ b/systems/sdcc-genoa/bench.slurm
@ -1,32 +0,0 @@
-#!/bin/bash
-#SBATCH --partition lqcd
-#SBATCH --time=00:50:00
-#SBATCH -A lqcdtest
-#SBATCH -q lqcd
-#SBATCH --exclusive
-#SBATCH --nodes=1
-#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
-#SBATCH --ntasks=1
-#SBATCH --cpus-per-task=64
-#SBATCH --qos lqcd
-
-source sourceme.sh
-
-export PLACES=(1:16:4 1:32:2 0:64:1);
-export THR=(16 32 64)
-
-for t in  2 
-do
-   
-export OMP_NUM_THREADS=${THR[$t]}
-export OMP_PLACES=${PLACES[$t]}
-export thr=${THR[$t]}
-
-#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
-for vol in 48.48.48.96
-do
-srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
-done
-#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
-done
-
--- a/systems/sdcc-genoa/bench2.slurm
+++ b/systems/sdcc-genoa/bench2.slurm
@ -1,36 +0,0 @@
-#!/bin/bash
-#SBATCH --partition lqcd
-#SBATCH --time=00:50:00
-#SBATCH -A lqcdtest
-#SBATCH -q lqcd
-#SBATCH --exclusive
-#SBATCH --nodes=2
-#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
-#SBATCH --ntasks=2
-#SBATCH --cpus-per-task=64
-#SBATCH --qos lqcd
-
-source sourceme.sh
-
-export PLACES=(1:16:4 1:32:2 0:64:1);
-export THR=(16 32 64)
-
-nodes=2
-mpi=1.1.1.2
-
-for t in 2 
-do
-   
-export OMP_NUM_THREADS=${THR[$t]}
-export OMP_PLACES=${PLACES[$t]}
-export thr=${THR[$t]}
-
-#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
-
-for vol in 64.64.64.128
-do
-srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
-done
-
-done
-
--- a/systems/sdcc-genoa/config-command
+++ b/systems/sdcc-genoa/config-command
@ -1,16 +0,0 @@
-../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
--enable-shm=shmopen \
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
-CXX=clang++ \
-MPICXX=mpicxx \
-CXXFLAGS="-std=c++17"
-
-
-
--- a/systems/sdcc-genoa/sourceme.sh
+++ b/systems/sdcc-genoa/sourceme.sh
@ -1,4 +0,0 @@
-source $HOME/spack/share/spack/setup-env.sh
-spack load llvm@17.0.4
-export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
-module load openmpi
--- a/systems/spack-linux/config-command
+++ b/systems/spack-linux/config-command
@ -1,17 +0,0 @@
-../../src/Grid/configure \
-    --prefix /home/pab/NPR/install \
-    --enable-comms=mpi-auto \
-    --enable-simd=AVX2 \
-    --enable-shm=none \
-    --enable-debug \
-    --with-lime=$CLIME \
-    --with-hdf5=$HDF5 \
-    --with-fftw=$FFTW \
-    --with-gmp=$GMP \
-    --with-mpfr=$MPFR \
-    --disable-gparity \
-    --disable-fermion-reps \
-    CXX=clang++ \
-    MPICXX=mpicxx \
-    CXXFLAGS="-std=c++17 "
-
--- a/systems/spack-linux/sourceme.sh
+++ b/systems/spack-linux/sourceme.sh
@ -1,28 +0,0 @@
-source $HOME/spack/share/spack/setup-env.sh
-spack load llvm@12
-spack load autoconf%clang@12.0.1
-spack load automake%clang@12.0.1
-spack load c-lime%clang@12.0.1
-spack load fftw%clang@12.0.1
-spack load gmp%clang@12.0.1
-spack load mpfr%clang@12.0.1
-spack load openmpi%clang@12.0.1
-spack load openssl%clang@12.0.1
-spack load hdf5+cxx%clang@12.0.1
-spack load cmake%clang@12.0.1
-export FFTW=`spack find --paths fftw%clang@12.0.1    | grep ^fftw   | awk '{print $2}' `
-export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1   | grep ^hdf5   | awk '{print $2}' `
-export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
-export MPFR=`spack find --paths mpfr%clang@12.0.1    | grep ^mpfr  | awk '{print $2}' `
-export LLVM=`spack find --paths llvm@12    | grep ^llvm  | awk '{print $2}' `
-export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk  '{print $2}' `
-export GMP=`spack find --paths gmp%clang@12.0.1      | grep ^gmp | awk '{print $2}' `
-export TCLAP=`spack find --paths tclap%clang@12.0.1  | grep ^tclap | awk '{print $2}' `
-export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH
-
-ulimit -s 81920
--- a/systems/spack-linux/spack-install
+++ b/systems/spack-linux/spack-install
@ -1,19 +0,0 @@
-cd
-git clone https://github.com/spack/spack.git
-source $HOME/spack/share/spack/setup-env.sh
-
-spack install llvm@12
-
-spack install autoconf%clang@12.0.1
-spack install automake%clang@12.0.1
-spack install c-lime%clang@12.0.1
-spack install fftw%clang@12.0.1
-spack install gmp%clang@12.0.1
-spack install mpfr%clang@12.0.1
-spack install openmpi%clang@12.0.1
-spack install openssl%clang@12.0.1
-spack install hdf5+cxx%clang@12.0.1
-spack install cmake%clang@12.0.1
-spack install tclap%clang@12.0.1
-spack install emacs%clang@12.0.1
-
--- a/tests/debug/Test_general_coarse_pvdagm.cc
+++ b/tests/debug/Test_general_coarse_pvdagm.cc
@ -47,20 +47,20 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  void Op     (const Field &in, Field &out){
-    //    std::cout << "Op: PVdag M "<<std::endl;
+    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
  }
  void AdjOp     (const Field &in, Field &out){
-    //    std::cout << "AdjOp: Mdag PV "<<std::endl;
+    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  void HermOp(const Field &in, Field &out){
-    //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
+    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    //    _Mat.M(in,tmp);
    //    _PV.Mdag(tmp,out);
@ -83,14 +83,14 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  void Op     (const Field &in, Field &out){
-    //    std::cout << "Op: PVdag M "<<std::endl;
+    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  void AdjOp     (const Field &in, Field &out){
-    //    std::cout << "AdjOp: Mdag PV "<<std::endl;
+    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(tmp,out);
    _Mat.Mdag(in,tmp);
@ -98,7 +98,7 @@ public:
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  void HermOp(const Field &in, Field &out){
-    //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
+    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
--- a/tests/lanczos/LanParams.xml
+++ b/tests/lanczos/LanParams.xml
@ -1,14 +0,0 @@
-<?xml version="1.0"?>
-<grid>
-  <LanczosParameters>
-    <mass>0.00107</mass>
-    <M5>1.8</M5>
-    <Ls>48</Ls>
-    <Nstop>10</Nstop>
-    <Nk>15</Nk>
-    <Np>85</Np>
-    <ChebyLow>0.003</ChebyLow>
-    <ChebyHigh>60</ChebyHigh>
-    <ChebyOrder>201</ChebyOrder>
-  </LanczosParameters>
-</grid>
--- a/tests/lanczos/Test_dwf_G5R5.cc
+++ b/tests/lanczos/Test_dwf_G5R5.cc
@ -1,346 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./tests/Test_dwf_G5R5.cc
-
-Copyright (C) 2015
-
-Author: Chulwoo Jung <chulwoo@bnl.gov>
-From Duo and Bob's Chirality study
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
- ;
-
-//typedef WilsonFermionD FermionOp;
-typedef DomainWallFermionD FermionOp;
-typedef typename DomainWallFermionD::FermionField FermionField;
-
-
-RealD AllZero(RealD x) { return 0.; }
-
-namespace Grid {
-
-struct LanczosParameters: Serializable {
-  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
-		  		RealD, mass , 
-		  		RealD, M5 , 
-	  			Integer, Ls,
-	  			Integer, Nstop,
-	  			Integer, Nk,
-	  			Integer, Np,
-	  			RealD, ChebyLow,
-	  			RealD, ChebyHigh,
-	  			Integer, ChebyOrder)
-//                                  Integer, StartTrajectory,
-//                                  Integer, Trajectories, /* @brief Number of sweeps in this run */
-//                                  bool, MetropolisTest,
-//                                  Integer, NoMetropolisUntil,
-//                                  std::string, StartingType,
-//                                  Integer, SW,
-//				  RealD, Kappa,
-//                                  IntegratorParameters, MD)
-
-  LanczosParameters() {
-    ////////////////////////////// Default values
-      mass = 0;
-//    MetropolisTest    = true;
-//    NoMetropolisUntil = 10;
-//    StartTrajectory   = 0;
-//    SW                = 2;
-//    Trajectories      = 10;
-//    StartingType      = "HotStart";
-    /////////////////////////////////
-  }
-
-  template <class ReaderClass >
-  LanczosParameters(Reader<ReaderClass> & TheReader){
-    initialize(TheReader);
-  }
-
-  template < class ReaderClass > 
-  void initialize(Reader<ReaderClass> &TheReader){
-//    std::cout << GridLogMessage << "Reading HMC\n";
-    read(TheReader, "HMC", *this);
-  }
-
-
-  void print_parameters() const {
-//    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
-//    MD.print_parameters();
-  }
-  
-};
-
-}
-
-int main(int argc, char** argv) {
-  Grid_init(&argc, &argv);
-
-  LanczosParameters LanParams;
-#if 1
-  {
-    XmlReader  HMCrd("LanParams.xml");
-    read(HMCrd,"LanczosParameters",LanParams);
-  }
-#else
-  {
-    LanParams.mass = mass;
-  }
-#endif
-  std::cout << GridLogMessage<< LanParams <<std::endl;
-  { 
-    XmlWriter HMCwr("LanParams.xml.out");
-    write(HMCwr,"LanczosParameters",LanParams);
-  }
-
-  int Ls=16;
-  RealD M5=1.8;
-  RealD mass = -1.0;
-
-  mass=LanParams.mass;
-  Ls=LanParams.Ls;
-  M5=LanParams.M5;
-
-  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
-      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
-      GridDefaultMpi());
-  GridRedBlackCartesian* UrbGrid =
-      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-//  GridCartesian* FGrid = UGrid;
-//  GridRedBlackCartesian* FrbGrid = UrbGrid;
-  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
-//  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
-
-  std::vector<int> seeds4({1, 2, 3, 4});
-  std::vector<int> seeds5({5, 6, 7, 8});
-  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
-  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
-  GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5);
-
-  LatticeGaugeField Umu(UGrid);
-
-  FieldMetaData header;
-  std::string file("./config");
-
-  int precision32 = 0;
-  int tworow      = 0;
-  NerscIO::readConfiguration(Umu,header,file);
-
-/*
-  std::vector<LatticeColourMatrix> U(4, UGrid);
-  for (int mu = 0; mu < Nd; mu++) {
-    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-  }
-*/
-
-  int Nstop = 10;
-  int Nk = 20;
-  int Np = 80;
-  Nstop=LanParams.Nstop;
-  Nk=LanParams.Nk;
-  Np=LanParams.Np;
-
-  int Nm = Nk + Np;
-  int MaxIt = 10000;
-  RealD resid = 1.0e-5;
-
-
-//while ( mass > - 5.0){
-  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
-//  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
-  Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
-//  Gamma5R5HermitianLinearOperator
-  std::vector<double> Coeffs{0, 1.};
-  Polynomial<FermionField> PolyX(Coeffs);
-
-  Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
-
-  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
-  PlainHermOp<FermionField> Op     (HermOp);
-  PlainHermOp<FermionField> Op2     (G5R5Herm);
-
-  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
-
-  std::vector<RealD> eval(Nm);
-  FermionField src(FGrid);
-  gaussian(RNG5, src);
-  std::vector<FermionField> evec(Nm, FGrid);
-  for (int i = 0; i < 1; i++) {
-    std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
-              << std::endl;
-  };
-
-  int Nconv;
-  IRL.calc(eval, evec, src, Nconv);
-
-  std::cout << mass <<" : " << eval << std::endl;
-
-#if 0
-  Gamma g5(Gamma::Algebra::Gamma5) ;
-  ComplexD dot;
-  FermionField tmp(FGrid);
-//  RealD eMe,eMMe;
-  for (int i = 0; i < Nstop ; i++) {
-//    tmp = g5*evec[i];
-    dot = innerProduct(evec[i],evec[i]);
-//    G5R5(tmp,evec[i]);
-    G5R5Herm.HermOpAndNorm(evec[i],tmp,eMe,eMMe);
-    std::cout <<"Norm "<<M5<<" "<< mass << " : " << i << " " << real(dot) << " " << imag(dot)  << " "<< eMe << " " <<eMMe<< std::endl ;
-    for (int j = 0; j < Nstop ; j++) {
-      dot = innerProduct(tmp,evec[j]);
-      std::cout <<"G5R5 "<<M5<<" "<< mass << " : " << i << " " <<j<<" " << real(dot) << " " << imag(dot)  << std::endl ;
-    }
-  }
-//  src  = evec[0]+evec[1]+evec[2];
-//  mass += -0.1;
-#endif
-
-  //**********************************************************************
-  //orthogonalization
-  //calculat the matrix
-  cout << "Start orthogonalization " << endl;
-  cout << "calculate the matrix element" << endl;
-  vector<LatticeFermion> G5R5Mevec(Nconv, FGrid);
-  vector<LatticeFermion> finalevec(Nconv, FGrid);
-  vector<RealD> eMe(Nconv), eMMe(Nconv);
-  for(int i = 0; i < Nconv; i++){
-    G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
-  }
-  cout << "Re<evec, G5R5M(evec)>: " << endl;
-  cout << eMe << endl;
-  cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
-  cout << eMMe << endl;
-  vector<vector<ComplexD>> VevecG5R5Mevec(Nconv);
-  Eigen::MatrixXcd evecG5R5Mevec = Eigen::MatrixXcd::Zero(Nconv, Nconv);
-  for(int i = 0; i < Nconv; i++){
-    VevecG5R5Mevec[i].resize(Nconv);
-    for(int j = 0; j < Nconv; j++){
-      VevecG5R5Mevec[i][j] = innerProduct(evec[i], G5R5Mevec[j]);
-      evecG5R5Mevec(i, j) = VevecG5R5Mevec[i][j];
-    }
-  }
-  //calculate eigenvector
-  cout << "Eigen solver" << endl;
-  Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(evecG5R5Mevec);
-  vector<RealD> eigeneval(Nconv);
-  vector<vector<ComplexD>> eigenevec(Nconv);
-  for(int i = 0; i < Nconv; i++){
-    eigeneval[i] = eigensolver.eigenvalues()[i];
-    eigenevec[i].resize(Nconv);
-    for(int j = 0; j < Nconv; j++){
-      eigenevec[i][j] = eigensolver.eigenvectors()(i, j);
-    }
-  }
-  //rotation
-  cout << "Do rotation" << endl;
-  for(int i = 0; i < Nconv; i++){
-    finalevec[i] = finalevec[i] - finalevec[i];
-    for(int j = 0; j < Nconv; j++){
-      finalevec[i] = eigenevec[j][i]*evec[j] + finalevec[i];
-    }
-  }
-  //normalize again;
-  for(int i = 0; i < Nconv; i++){
-    RealD tmp_RealD = norm2(finalevec[i]);
-    tmp_RealD = 1./pow(tmp_RealD, 0.5);
-    finalevec[i] = finalevec[i]*tmp_RealD;
-  }
-
-  //check
-  for(int i = 0; i < Nconv; i++){
-    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
-  }
-
-  //**********************************************************************
-  //sort the eigenvectors
-  vector<LatticeFermion> finalevec_copy(Nconv, FGrid);
-  for(int i = 0; i < Nconv; i++){
-    finalevec_copy[i] = finalevec[i];
-  }
-  vector<RealD> eMe_copy(eMe);
-  for(int i = 0; i < Nconv; i++){
-    eMe[i] = fabs(eMe[i]);
-    eMe_copy[i] = eMe[i];
-  }
-  sort(eMe_copy.begin(), eMe_copy.end());
-  for(int i = 0; i < Nconv; i++){
-    for(int j = 0; j < Nconv; j++){
-      if(eMe[j] == eMe_copy[i]){
-        finalevec[i] = finalevec_copy[j];
-      }
-    }
-  }
-    for(int i = 0; i < Nconv; i++){
-    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
-  }
-  cout << "Re<evec, G5R5M(evec)>: " << endl;
-  cout << eMe << endl;
-  cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
-  cout << eMMe << endl;
-
-
-//  vector<LatticeFermion> finalevec(Nconv, FGrid);
-// temporary, until doing rotation
-//  for(int i = 0; i < Nconv; i++)
-//	  finalevec[i]=evec[i];
-  //**********************************************************************
-  //calculate chirality matrix
-  vector<LatticeFermion> G5evec(Nconv, FGrid);
-  vector<vector<ComplexD>> chiral_matrix(Nconv);
-  vector<vector<RealD>> chiral_matrix_real(Nconv);
-  for(int i = 0; i < Nconv; i++){
-//    G5evec[i] = G5evec[i] - G5evec[i];
-    G5evec[i] = Zero();
-    for(int j = 0; j < Ls/2; j++){
-      axpby_ssp(G5evec[i], 1., finalevec[i], 0., G5evec[i], j, j);
-    }
-    for(int j = Ls/2; j < Ls; j++){
-      axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
-    }
-  }
-  for(int i = 0; i < Nconv; i++){
-    chiral_matrix_real[i].resize(Nconv);
-    chiral_matrix[i].resize(Nconv);
-    for(int j = 0; j < Nconv; j++){
-      chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
-      chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
-      std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
-    }
-  }
-  for(int i = 0; i < Nconv; i++){
-    if(chiral_matrix[i][i].real() < 0.){
-      chiral_matrix_real[i][i] = -1. * chiral_matrix_real[i][i];
-    }
-  }
-
-
-  Grid_finalize();
-}
--- a/tests/lanczos/Test_wilson_DWFKernel.cc
+++ b/tests/lanczos/Test_wilson_DWFKernel.cc
@ -1,278 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./tests/Test_dwf_lanczos.cc
-
-Copyright (C) 2015
-
-Author: Chulwoo Jung <chulwoo@bnl.gov>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
- ;
-
-typedef WilsonFermionD FermionOp;
-typedef typename WilsonFermionD::FermionField FermionField;
-
-
-RealD AllZero(RealD x) { return 0.; }
-
-namespace Grid {
-
-#if 0
-template<typename Field>
-class RationalHermOp : public LinearFunction<Field> {
-public:
-  using LinearFunction<Field>::operator();  
-//  OperatorFunction<Field>   & _poly;
-  LinearOperatorBase<Field> &_Linop;
-  RealD _massDen, _massNum;
-   
-  FunctionHermOp(LinearOperatorBase<Field>& linop, RealD massDen,RealD massNum)
-    :  _Linop(linop) ,_massDen(massDen),_massNum(massNum) {};
-      
-  void operator()(const Field& in, Field& out) {
-//    _poly(_Linop,in,out);
-  } 
-};
-#endif
-
-template<class Matrix,class Field>
-class InvG5LinearOperator : public LinearOperatorBase<Field> {
-  Matrix &_Mat;
-  RealD _num;
-  RealD _Tol;
-  Integer _MaxIt;
-  Gamma g5;
-
-public:
-  InvG5LinearOperator(Matrix &Mat,RealD num): _Mat(Mat),_num(num), _Tol(1e-12),_MaxIt(10000), g5(Gamma::Algebra::Gamma5) {};
-
-  // Support for coarsening to a multigrid
-  void OpDiag (const Field &in, Field &out) {
-    assert(0);
-    _Mat.Mdiag(in,out);
-  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
-    _Mat.Mdir(in,out,dir,disp);
-  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
-    _Mat.MdirAll(in,out);
-  };
-  void Op     (const Field &in, Field &out){
-    assert(0);
-    _Mat.M(in,out);
-  }
-  void AdjOp     (const Field &in, Field &out){
-    assert(0);
-    _Mat.Mdag(in,out);
-  }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    HermOp(in,out);
-    ComplexD dot = innerProduct(in,out);
-    n1=real(dot);
-    n2=norm2(out);
-  }
-  void HermOp(const Field &in, Field &out){
-     Field tmp(in.Grid());
-     MdagMLinearOperator<Matrix,Field> denom(_Mat);
-     ConjugateGradient<Field> CG(_Tol,_MaxIt); 
-     _Mat.M(in,tmp);
-     tmp += _num*in;
-     _Mat.Mdag(tmp,out);
-     CG(denom,out,tmp);
-     out = g5*tmp;
-  }
-};
-
-
-struct LanczosParameters: Serializable {
-  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
-		  		RealD, mass , 
-				RealD, resid,
-	  			RealD, ChebyLow,
-	  			RealD, ChebyHigh,
-	  			Integer, ChebyOrder)
-//                                  Integer, StartTrajectory,
-//                                  Integer, Trajectories, /* @brief Number of sweeps in this run */
-//                                  bool, MetropolisTest,
-//                                  Integer, NoMetropolisUntil,
-//                                  std::string, StartingType,
-//                                  Integer, SW,
-//				  RealD, Kappa,
-//                                  IntegratorParameters, MD)
-
-  LanczosParameters() {
-    ////////////////////////////// Default values
-      mass = 0;
-//    MetropolisTest    = true;
-//    NoMetropolisUntil = 10;
-//    StartTrajectory   = 0;
-//    SW                = 2;
-//    Trajectories      = 10;
-//    StartingType      = "HotStart";
-    /////////////////////////////////
-  }
-
-  template <class ReaderClass >
-  LanczosParameters(Reader<ReaderClass> & TheReader){
-    initialize(TheReader);
-  }
-
-  template < class ReaderClass > 
-  void initialize(Reader<ReaderClass> &TheReader){
-//    std::cout << GridLogMessage << "Reading HMC\n";
-    read(TheReader, "HMC", *this);
-  }
-
-
-  void print_parameters() const {
-//    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
-//    MD.print_parameters();
-  }
-  
-};
-
-}
-
-int main(int argc, char** argv) {
-  Grid_init(&argc, &argv);
-
-  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
-      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
-      GridDefaultMpi());
-  GridRedBlackCartesian* UrbGrid =
-      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian* FGrid = UGrid;
-  GridRedBlackCartesian* FrbGrid = UrbGrid;
-//  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
-
-  std::vector<int> seeds4({1, 2, 3, 4});
-  std::vector<int> seeds5({5, 6, 7, 8});
-  GridParallelRNG RNG5(FGrid);
-  RNG5.SeedFixedIntegers(seeds5);
-  GridParallelRNG RNG4(UGrid);
-  RNG4.SeedFixedIntegers(seeds4);
-  GridParallelRNG RNG5rb(FrbGrid);
-  RNG5.SeedFixedIntegers(seeds5);
-
-  LatticeGaugeField Umu(UGrid);
-//  SU<Nc>::HotConfiguration(RNG4, Umu);
-
-  FieldMetaData header;
-  std::string file("./config");
-
-  int precision32 = 0;
-  int tworow      = 0;
-//  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
-  NerscIO::readConfiguration(Umu,header,file);
-
-/*
-  std::vector<LatticeColourMatrix> U(4, UGrid);
-  for (int mu = 0; mu < Nd; mu++) {
-    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-  }
-*/
-
-  int Nstop = 5;
-  int Nk = 10;
-  int Np = 90;
-  int Nm = Nk + Np;
-  int MaxIt = 10000;
-  RealD resid = 1.0e-5;
-
-  RealD mass = -1.0;
-
-  LanczosParameters LanParams;
-#if 1
-  {
-    XmlReader  HMCrd("LanParams.xml");
-    read(HMCrd,"LanczosParameters",LanParams);
-  }
-#else
-  {
-    LanParams.mass = mass;
-  }
-#endif
-  std::cout << GridLogMessage<< LanParams <<std::endl;
-  { 
-    XmlWriter HMCwr("LanParams.xml.out");
-    write(HMCwr,"LanczosParameters",LanParams);
-  }
-
-  mass=LanParams.mass;
-  resid=LanParams.resid;
-
-
-while ( mass > - 5.0){
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
-  InvG5LinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator,-2.); /// <-----
-  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
-//  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
-
-  std::vector<double> Coeffs{0, 0, 1.};
-  Polynomial<FermionField> PolyX(Coeffs);
-  Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
-
-       FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
-//     InvHermOp<FermionField> Op(WilsonOperator,HermOp);
-     PlainHermOp<FermionField> Op     (HermOp);
-//     PlainHermOp<FermionField> Op2     (HermOp2);
-
-  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
-
-  std::vector<RealD> eval(Nm);
-  FermionField src(FGrid);
-  gaussian(RNG5, src);
-  std::vector<FermionField> evec(Nm, FGrid);
-  for (int i = 0; i < 1; i++) {
-    std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
-              << std::endl;
-  };
-
-  int Nconv;
-  IRL.calc(eval, evec, src, Nconv);
-
-  std::cout << mass <<" : " << eval << std::endl;
-
-  Gamma g5(Gamma::Algebra::Gamma5) ;
-  ComplexD dot;
-  FermionField tmp(FGrid);
-  for (int i = 0; i < Nstop ; i++) {
-    tmp = g5*evec[i];
-    dot = innerProduct(tmp,evec[i]);
-    std::cout << mass << " : " << eval[i]  << " " << real(dot) << " " << imag(dot)  << std::endl ;
-  }
-  src  = evec[0]+evec[1]+evec[2];
-  mass += -0.1;
-}
-
-  Grid_finalize();
-}
--- a/tests/lanczos/Test_wilson_specflow.cc
+++ b/tests/lanczos/Test_wilson_specflow.cc
@ -1,211 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./tests/Test_dwf_lanczos.cc
-
-Copyright (C) 2015
-
-Author: Chulwoo Jung <chulwoo@bnl.gov>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
- ;
-
-typedef WilsonFermionD FermionOp;
-typedef typename WilsonFermionD::FermionField FermionField;
-
-
-RealD AllZero(RealD x) { return 0.; }
-
-namespace Grid {
-
-struct LanczosParameters: Serializable {
-  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
-		  		RealD, mass , 
-	  			RealD, ChebyLow,
-	  			RealD, ChebyHigh,
-	  			Integer, ChebyOrder)
-//                                  Integer, StartTrajectory,
-//                                  Integer, Trajectories, /* @brief Number of sweeps in this run */
-//                                  bool, MetropolisTest,
-//                                  Integer, NoMetropolisUntil,
-//                                  std::string, StartingType,
-//                                  Integer, SW,
-//				  RealD, Kappa,
-//                                  IntegratorParameters, MD)
-
-  LanczosParameters() {
-    ////////////////////////////// Default values
-      mass = 0;
-//    MetropolisTest    = true;
-//    NoMetropolisUntil = 10;
-//    StartTrajectory   = 0;
-//    SW                = 2;
-//    Trajectories      = 10;
-//    StartingType      = "HotStart";
-    /////////////////////////////////
-  }
-
-  template <class ReaderClass >
-  LanczosParameters(Reader<ReaderClass> & TheReader){
-    initialize(TheReader);
-  }
-
-  template < class ReaderClass > 
-  void initialize(Reader<ReaderClass> &TheReader){
-//    std::cout << GridLogMessage << "Reading HMC\n";
-    read(TheReader, "HMC", *this);
-  }
-
-
-  void print_parameters() const {
-//    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
-//    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
-//    MD.print_parameters();
-  }
-  
-};
-
-}
-
-int main(int argc, char** argv) {
-  Grid_init(&argc, &argv);
-
-  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
-      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
-      GridDefaultMpi());
-  GridRedBlackCartesian* UrbGrid =
-      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian* FGrid = UGrid;
-  GridRedBlackCartesian* FrbGrid = UrbGrid;
-//  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
-
-  std::vector<int> seeds4({1, 2, 3, 4});
-  std::vector<int> seeds5({5, 6, 7, 8});
-  GridParallelRNG RNG5(FGrid);
-  RNG5.SeedFixedIntegers(seeds5);
-  GridParallelRNG RNG4(UGrid);
-  RNG4.SeedFixedIntegers(seeds4);
-  GridParallelRNG RNG5rb(FrbGrid);
-  RNG5.SeedFixedIntegers(seeds5);
-
-  LatticeGaugeField Umu(UGrid);
-//  SU<Nc>::HotConfiguration(RNG4, Umu);
-
-  FieldMetaData header;
-  std::string file("./config");
-
-  int precision32 = 0;
-  int tworow      = 0;
-//  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
-  NerscIO::readConfiguration(Umu,header,file);
-
-/*
-  std::vector<LatticeColourMatrix> U(4, UGrid);
-  for (int mu = 0; mu < Nd; mu++) {
-    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-  }
-*/
-
-  int Nstop = 10;
-  int Nk = 20;
-  int Np = 80;
-  int Nm = Nk + Np;
-  int MaxIt = 10000;
-  RealD resid = 1.0e-5;
-
-  RealD mass = -1.0;
-
-  LanczosParameters LanParams;
-#if 1
-  {
-    XmlReader  HMCrd("LanParams.xml");
-    read(HMCrd,"LanczosParameters",LanParams);
-  }
-#else
-  {
-    LanParams.mass = mass;
-  }
-#endif
-  std::cout << GridLogMessage<< LanParams <<std::endl;
-  { 
-    XmlWriter HMCwr("LanParams.xml.out");
-    write(HMCwr,"LanczosParameters",LanParams);
-  }
-
-  mass=LanParams.mass;
-
-
-while ( mass > - 5.0){
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
-  MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
-  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
-  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
-
-  std::vector<double> Coeffs{0, 1.};
-  Polynomial<FermionField> PolyX(Coeffs);
-//  Chebyshev<FermionField> Cheby(0.5, 60., 31);
-//                                  RealD, ChebyLow,
-//                                RealD, ChebyHigh,
-//                                Integer, ChebyOrder)
-
-  Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
-
-  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
-     PlainHermOp<FermionField> Op     (HermOp);
-     PlainHermOp<FermionField> Op2     (HermOp2);
-
-  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
-
-  std::vector<RealD> eval(Nm);
-  FermionField src(FGrid);
-  gaussian(RNG5, src);
-  std::vector<FermionField> evec(Nm, FGrid);
-  for (int i = 0; i < 1; i++) {
-    std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
-              << std::endl;
-  };
-
-  int Nconv;
-  IRL.calc(eval, evec, src, Nconv);
-
-  std::cout << mass <<" : " << eval << std::endl;
-
-  Gamma g5(Gamma::Algebra::Gamma5) ;
-  ComplexD dot;
-  FermionField tmp(FGrid);
-  for (int i = 0; i < Nstop ; i++) {
-    tmp = g5*evec[i];
-    dot = innerProduct(tmp,evec[i]);
-    std::cout << mass << " : " << eval[i]  << " " << real(dot) << " " << imag(dot)  << std::endl ;
-  }
-  src  = evec[0]+evec[1]+evec[2];
-  mass += -0.1;
-}
-
-  Grid_finalize();
-}
Author	SHA1	Message	Date
Ed Bennett	dcfc4ed81f	Merge `32e6d58356` into `ba9bbe0221`	2025-02-13 11:26:59 +00:00
Ed Bennett	32e6d58356	use accelerator for setCheckerboard in RHMC	2021-12-22 23:43:43 +00:00