Merge pull request #397 from giltirn/feature/dirichlet-gparity-stage

Gparity HMC import round 2
2026-02-04 14:23:29 +00:00 · 2022-05-25 13:29:45 -04:00
parent 0417b96896 6121397587
commit d3496d2fe0
14 changed files with 852 additions and 93 deletions
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -49,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@@ -68,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
    }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
    GridStopWatch TotalTimer;
@@ -97,6 +99,7 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
@@ -130,6 +133,7 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);
      //Inner CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@@ -150,6 +154,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -183,6 +183,9 @@ public:
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -0,0 +1,409 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety the a final regular CG is applied to clean up if necessary
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>{
 public:
  LinearOperatorBase<Field> &linop_base;
  RealD shift;
  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void HermOp(const Field &in, Field &out){
    linop_base.HermOp(in, out);
    axpy(out, shift, in, out);
  }    
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
 };
 };
 template<class FieldD, class FieldF,
 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
 					     public OperatorFunction<FieldD>
 {
 public:                                                
  using OperatorFunction<FieldD>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision
  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq
 				       ) : 
    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;
    (*this)(Linop,src,results);
    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }
    return;
  }
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
    GridBase *DoublePrecGrid = src_d.Grid();
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;
    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);
    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldD mmp_d(DoublePrecGrid);
    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
    const int       primary =0;
    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev
    // Matrix mult fields
    FieldF r_f(SinglePrecGrid);
    FieldF p_f(SinglePrecGrid);
    FieldF tmp_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    FieldF src_f(SinglePrecGrid);
    precisionChange(src_f, src_d);
    // Check lightest mass
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }
    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi_d[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      ps_d[s] = src_d;
    }
    // r and p for primary
    r_f=src_f; //residual maintained in single
    p_f=src_f;
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    //MdagM+m[0]
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    axpy(mmp_f,mass[0],p_f,mmp_f);
    RealD rn = norm2(p_f);
    d += rn*mass[0];
    b = -cp /d;
    // Set up the various shift variables
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz]; 
    }
    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_f,b,mmp_f,r_f);
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
    }
    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
    SolverTimer.Start();
    // Iteration loop
    int k;
    for (k=1;k<=MaxIterations;k++){    
      a = c /cp;
      //Update double precision search direction by residual
      PrecChangeTimer.Start();
      precisionChange(r_d, r_f);
      PrecChangeTimer.Stop();
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 
      for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
 	    axpy(ps_d[s],a,ps_d[s],r_d);
 	  } else{
 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
 	  }
 	}
      }
      AXPYTimer.Stop();
      PrecChangeTimer.Start();
      precisionChange(p_f, p_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      cp=c;
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f); 
      d=real(innerProduct(p_f,mmp_f));    
      MatrixTimer.Stop();  
      AXPYTimer.Start();
      axpy(mmp_f,mass[0],p_f,mmp_f);
      AXPYTimer.Stop();
      RealD rn = norm2(p_f);
      d += rn*mass[0];
      bp=b;
      b=-cp/d;
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
 	  RealD z1 = z[s][iz];
 	  z[s][iz] = z0*z1*bp
 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
      }
      ShiftTimer.Stop();
      //Update double precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	int ss = s;
 	if( (!converged[s]) ) { 
 	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
 	}
      }
      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
      AXPYTimer.Stop();
      c = c_f;
      if(k % ReliableUpdateFreq == 0){
 	//Replace r with true residual
 	MatrixTimer.Start();  
 	Linop_d.HermOp(psi_d[0],mmp_d); 
 	MatrixTimer.Stop();  
 	AXPYTimer.Start();
 	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
 	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
 	AXPYTimer.Stop();
 	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
 	PrecChangeTimer.Start();
 	precisionChange(r_f, r_d);
 	PrecChangeTimer.Stop();
 	c = c_d;
      }
      // Convergence checks
      int all_converged = 1;
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	  if(css<rsq[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
 	  } else {
 	    all_converged=0;
 	  }
 	}
      }
      if ( all_converged ){
 	SolverTimer.Stop();
 	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
 	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
 	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
 	  axpy(r_d,-alpha[s],src_d,tmp_d);
 	  RealD rn = norm2(r_d);
 	  RealD cn = norm2(src_d);
 	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
 	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
 	  if(rn >= rsq[s]){
 	    CleanupTimer.Start();
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
 	    //Setup linear operators for final cleanup
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
 	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
 	    cg(src_d, psi_d[s]);
 	    TrueResidualShift[s] = cg.TrueResidual;
 	    CleanupTimer.Stop();
 	  }
 	}
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
 	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 	IterationsToComplete = k;	
 	return;
      }
    }
    // ugly hack
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    //  assert(0);
  }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -44,6 +44,7 @@ public:
 				  int, MinRes);    // Must restart
 };
 //This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -155,6 +156,7 @@ public:
      _coarse_relax_tol(coarse_relax_tol)  
  {    };
  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@@ -181,8 +183,16 @@ public:
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
  //applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -201,13 +211,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
 	     <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -285,6 +295,10 @@ public:
    evals_coarse.resize(0);
  };
  //The block inner product is the inner product on the fine grid locally summed over the blocks
  //to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
  //vectors under the block inner product. This step must be performed after computing the fine grid
  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -328,6 +342,8 @@ public:
    }
  }
  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@@ -376,18 +392,23 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op);
+    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); 
    evals_coarse.resize(Nm);
@@ -395,6 +416,7 @@ public:
    CoarseField src(_CoarseGrid);     src=1.0; 
    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -405,6 +427,14 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
  //Get the fine eigenvector 'i' by reconstruction
  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
    blockPromote(evec_coarse[i],evec,subspace);  
    eval = evals_coarse[i];
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@@ -30,6 +30,8 @@ template<class Field> class PowerMethod
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
 	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -39,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@@ -198,7 +200,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -30,6 +30,18 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+    //If this site is an global boundary site, perform the G-parity flavor twist
-
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@@ -197,6 +209,19 @@ public:
    reg = memory;
  }
  //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    autoView(poke_f0_v, poke_f0, CpuRead);
    autoView(poke_f1_v, poke_f1, CpuRead);
    autoView(Uds_v, Uds, CpuWrite);
    thread_foreach(ss,poke_f0_v,{
 	Uds_v[ss](0)(mu) = poke_f0_v[ss]();
 	Uds_v[ss](1)(mu) = poke_f1_v[ss]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -208,13 +233,18 @@ public:
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-    for(int mu=0;mu<Nd;mu++){
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
    for(int mu=0;mu<Nd-1;mu++){
-      LatticeCoordinate(coor,mu);
+      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -229,7 +259,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	  });
+	});
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@@ -260,6 +290,38 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -300,27 +362,47 @@ public:
  }
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls=Btilde.Grid()->_fdimensions[0];
    int Ls = Btilde.Grid()->_fdimensions[0];
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
+      GridBase *GaugeGrid = mat.Grid();
-      autoView( Atilde_v , Atilde, CpuRead);
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
-      autoView( Btilde_v , Btilde, CpuRead);
+
-      thread_for(ss,tmp.Grid()->oSites(),{
+      if( Params.twists[mu] ){
-	  for (int s = 0; s < Ls; s++) {
+	LatticeCoordinate(coor,mu);
-	    int sF = s + Ls * ss;
+      }
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      autoView( mat_v , mat, AcceleratorWrite);
-	  }
+      autoView( Btilde_v , Btilde, AcceleratorRead);
-	});
+      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
 typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
--- a/tests/core/Test_gparity.cc
+++ b/tests/core/Test_gparity.cc
@@ -55,6 +55,7 @@ static_assert(same_vComplex == 1, "Dirac Operators must have same underlying SIM
 int main (int argc, char ** argv)
 {
  int nu = 0;
  int tbc_aprd = 0; //use antiperiodic BCs in the time direction?
  Grid_init(&argc,&argv);
@@ -62,6 +63,9 @@ int main (int argc, char ** argv)
    if(std::string(argv[i]) == "--Gparity-dir"){
      std::stringstream ss; ss << argv[i+1]; ss >> nu;
      std::cout << GridLogMessage << "Set Gparity direction to " << nu << std::endl;
    }else if(std::string(argv[i]) == "--Tbc-APRD"){
      tbc_aprd = 1;
      std::cout << GridLogMessage << "Using antiperiodic BCs in the time direction" << std::endl;
    }
  }
@@ -155,13 +159,18 @@ int main (int argc, char ** argv)
  //Coordinate grid for reference
  LatticeInteger    xcoor_1f5(FGrid_1f);
-  LatticeCoordinate(xcoor_1f5,1+nu);
+  LatticeCoordinate(xcoor_1f5,1+nu); //note '1+nu'! This is because for 5D fields the s-direction is direction 0
  Replicate(src,src_1f);
  src_1f   = where( xcoor_1f5 >= Integer(L), 2.0*src_1f,src_1f );
  RealD mass=0.0;
  RealD M5=1.8;
-  StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS);
+
  //Standard Dirac op
  AcceleratorVector<Complex,4> bc_std(Nd, 1.0);
  if(tbc_aprd) bc_std[Nd-1] = -1.; //antiperiodic time BC
  StandardDiracOp::ImplParams std_params(bc_std);
  StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS, std_params);
  StandardFermionField    src_o_1f(FrbGrid_1f);
  StandardFermionField result_o_1f(FrbGrid_1f);
@@ -172,9 +181,11 @@ int main (int argc, char ** argv)
  ConjugateGradient<StandardFermionField> CG(1.0e-8,10000);
  CG(HermOpEO,src_o_1f,result_o_1f);
-  //  const int nu = 3;
+  //Gparity Dirac op
  std::vector<int> twists(Nd,0);
  twists[nu] = 1;
  if(tbc_aprd) twists[Nd-1] = 1;
  GparityDiracOp::ImplParams params;
  params.twists = twists;
  GparityDiracOp GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5 DOP_PARAMS,params);
@@ -271,8 +282,11 @@ int main (int argc, char ** argv)
  std::cout << "2f cb "<<result_o_2f.Checkerboard()<<std::endl;
  std::cout << "1f cb "<<result_o_1f.Checkerboard()<<std::endl;
-  std::cout << " result norms " <<norm2(result_o_2f)<<" " <<norm2(result_o_1f)<<std::endl;
+  //Compare norms
  std::cout << " result norms 2f: " <<norm2(result_o_2f)<<" 1f: " <<norm2(result_o_1f)<<std::endl;
  //Take the 2f solution and convert into the corresponding 1f solution (odd cb only)
  StandardFermionField    res0o  (FrbGrid_2f); 
  StandardFermionField    res1o  (FrbGrid_2f); 
  StandardFermionField    res0  (FGrid_2f); 
@@ -281,12 +295,13 @@ int main (int argc, char ** argv)
  res0=Zero();
  res1=Zero();
-  res0o = PeekIndex<0>(result_o_2f,0);
+  res0o = PeekIndex<0>(result_o_2f,0); //flavor 0, odd cb
-  res1o = PeekIndex<0>(result_o_2f,1);
+  res1o = PeekIndex<0>(result_o_2f,1); //flavor 1, odd cb
  std::cout << "res cb "<<res0o.Checkerboard()<<std::endl;
  std::cout << "res cb "<<res1o.Checkerboard()<<std::endl;
  //poke odd onto non-cb field
  setCheckerboard(res0,res0o); 
  setCheckerboard(res1,res1o); 
@@ -296,12 +311,13 @@ int main (int argc, char ** argv)
  Replicate(res0,replica0);
  Replicate(res1,replica1);
  //2nd half of doubled lattice has f=1
  replica = where( xcoor_1f5 >= Integer(L), replica1,replica0 );
  replica0 = Zero();
  setCheckerboard(replica0,result_o_1f);
-  std::cout << "Norm2 solutions is " <<norm2(replica)<<" "<< norm2(replica0)<<std::endl;
+  std::cout << "Norm2 solutions 1f reconstructed from 2f: " <<norm2(replica)<<" Actual 1f: "<< norm2(replica0)<<std::endl;
  replica = replica - replica0;
--- a/tests/forces/Test_dwf_gpforce.cc
+++ b/tests/forces/Test_dwf_gpforce.cc
@@ -71,26 +71,14 @@ int main (int argc, char ** argv)
  ////////////////////////////////////
  RealD mass=0.2; //kills the diagonal term
  RealD M5=1.8;
  //  const int nu = 3;
  //  std::vector<int> twists(Nd,0); // twists[nu] = 1;
  //  GparityDomainWallFermionR::ImplParams params;  params.twists = twists;
  //  GparityDomainWallFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
-  //  DomainWallFermionR Dw     (U,     Grid,RBGrid,mass,M5);
+  const int nu = 0; //gparity direction
  const int nu = 3;
  std::vector<int> twists(Nd,0);
  twists[nu] = 1;
  twists[Nd-1] = 1; //antiperiodic in time
  GparityDomainWallFermionR::ImplParams params;
  params.twists = twists;
  /*
  params.boundary_phases[0] = 1.0;
  params.boundary_phases[1] = 1.0;
  params.boundary_phases[2] = 1.0;
  params.boundary_phases[3] =- 1.0;
  */
  GparityDomainWallFermionR Dw(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
  Dw.M   (phi,Mphi);
--- a/tests/forces/Test_gpdwf_force.cc
+++ b/tests/forces/Test_gpdwf_force.cc
@@ -71,8 +71,10 @@ int main (int argc, char ** argv)
  RealD mass=0.01; 
  RealD M5=1.8; 
-  const int nu = 3;
+  const int nu = 1;
-  std::vector<int> twists(Nd,0);  twists[nu] = 1;
+  std::vector<int> twists(Nd,0);
  twists[nu] = 1;
  twists[3] = 1;
  GparityDomainWallFermionR::ImplParams params;  params.twists = twists;
  GparityDomainWallFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
  Ddwf.M   (phi,Mphi);
--- a/tests/forces/Test_gpwilson_force.cc
+++ b/tests/forces/Test_gpwilson_force.cc
@@ -64,8 +64,12 @@ int main (int argc, char ** argv)
  ////////////////////////////////////
  RealD mass=0.01; 
-  const int nu = 3;
+  const int nu = 1;
-  std::vector<int> twists(Nd,0);  twists[nu] = 1;
+  const int Lnu=latt_size[nu];
  std::vector<int> twists(Nd,0);
  twists[nu] = 1;
  twists[3]=1;
  GparityWilsonFermionR::ImplParams params;  params.twists = twists;
  GparityWilsonFermionR Wil(U,*UGrid,*UrbGrid,mass,params);
  Wil.M   (phi,Mphi);
--- a/tests/lanczos/Test_dwf_lanczos.cc
+++ b/tests/lanczos/Test_dwf_lanczos.cc
@@ -31,14 +31,38 @@ using namespace std;
 using namespace Grid;
 ;
-typedef typename GparityDomainWallFermionR::FermionField FermionField;
+template<typename Action>
 struct Setup{};
-RealD AllZero(RealD x){ return 0.;}
+template<>
 struct Setup<GparityMobiusFermionR>{
  static GparityMobiusFermionR* getAction(LatticeGaugeField &Umu,
 					  GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
    RealD mass=0.01;
    RealD M5=1.8;
    RealD mob_b=1.5;
    GparityMobiusFermionD ::ImplParams params;
    std::vector<int> twists({1,1,1,0});
    params.twists = twists;
    return new GparityMobiusFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
  }
 };
-int main (int argc, char ** argv)
+template<>
-{
+struct Setup<DomainWallFermionR>{
-  Grid_init(&argc,&argv);
+  static DomainWallFermionR* getAction(LatticeGaugeField &Umu,
 					  GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
    RealD mass=0.01;
    RealD M5=1.8;
    return new DomainWallFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  }
 };
 template<typename Action>
 void run(){
  typedef typename Action::FermionField FermionField;
  const int Ls=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -56,24 +80,10 @@ int main (int argc, char ** argv)
  LatticeGaugeField Umu(UGrid); 
  SU<Nc>::HotConfiguration(RNG4, Umu);
-  std::vector<LatticeColourMatrix> U(4,UGrid);
+  Action *action = Setup<Action>::getAction(Umu,FGrid,FrbGrid,UGrid,UrbGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
-  RealD mass=0.01;
+  //MdagMLinearOperator<Action,FermionField> HermOp(Ddwf);
-  RealD M5=1.8;
+  SchurDiagTwoOperator<Action,FermionField> HermOp(*action);
  RealD mob_b=1.5;
 //  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  GparityMobiusFermionD ::ImplParams params;
  std::vector<int> twists({1,1,1,0});
  params.twists = twists;
  GparityMobiusFermionR  Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
 //  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
 //  SchurDiagTwoOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
  SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
 //  SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
  const int Nstop = 30;
  const int Nk = 40;
@@ -91,7 +101,6 @@ int main (int argc, char ** argv)
  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);
  std::vector<RealD>          eval(Nm);
  FermionField    src(FrbGrid); 
  gaussian(RNG5rb,src);
@@ -103,6 +112,28 @@ int main (int argc, char ** argv)
  int Nconv;
  IRL.calc(eval,evec,src,Nconv);
  delete action;
 }
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  std::string action = "GparityMobius";
  for(int i=1;i<argc;i++){
    if(std::string(argv[i]) == "-action"){
      action = argv[i+1];
    }
  }
  if(action == "GparityMobius"){
    run<GparityMobiusFermionR>();
  }else if(action == "DWF"){
    run<DomainWallFermionR>();
  }else{
    std::cout << "Unknown action" << std::endl;
    exit(1);
  }
  Grid_finalize();
 }
--- a/tests/solver/Test_dwf_multishift_mixedprec.cc
+++ b/tests/solver/Test_dwf_multishift_mixedprec.cc
@@ -0,0 +1,184 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_dwf_multishift_mixedprec.cc
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 template<typename SpeciesD, typename SpeciesF, typename GaugeStatisticsType>
 void run_test(int argc, char ** argv, const typename SpeciesD::ImplParams &params){
  const int Ls = 16;
  GridCartesian* UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
  GridCartesian* FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_d);
  GridRedBlackCartesian* FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_d);
  GridCartesian* UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
  GridCartesian* FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_f);
  GridRedBlackCartesian* FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_f);
  typedef typename SpeciesD::FermionField FermionFieldD;
  typedef typename SpeciesF::FermionField FermionFieldF;
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
  GridParallelRNG RNG5(FGrid_d);
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid_d);
  RNG4.SeedFixedIntegers(seeds4);
  FermionFieldD src_d(FGrid_d);
  random(RNG5, src_d);
  LatticeGaugeFieldD Umu_d(UGrid_d);
  //CPS-created G-parity ensembles have a factor of 2 error in the plaquette that causes the read to fail unless we workaround it
  bool gparity_plaquette_fix = false;
  for(int i=1;i<argc;i++){
    if(std::string(argv[i]) == "--gparity_plaquette_fix"){
      gparity_plaquette_fix=true;
      break;
    }
  }
  bool cfg_loaded=false;
  for(int i=1;i<argc;i++){
    if(std::string(argv[i]) == "--load_config"){
      assert(i != argc-1);
      std::string file = argv[i+1];
      NerscIO io;
      FieldMetaData metadata;
      if(gparity_plaquette_fix) NerscIO::exitOnReadPlaquetteMismatch() = false;
      io.readConfiguration<GaugeStatisticsType>(Umu_d, metadata, file);
      if(gparity_plaquette_fix){
 	metadata.plaquette *= 2.; //correct header value
 	//Get the true plaquette
 	FieldMetaData tmp;
 	GaugeStatisticsType gs; gs(Umu_d, tmp);
 	std::cout << "After correction: plaqs " << tmp.plaquette << " " << metadata.plaquette << std::endl;
 	assert(fabs(tmp.plaquette -metadata.plaquette ) < 1.0e-5 );
      }
      cfg_loaded=true;
      break;
    }
  }
  if(!cfg_loaded)
    SU<Nc>::HotConfiguration(RNG4, Umu_d);
  LatticeGaugeFieldF Umu_f(UGrid_f);
  precisionChange(Umu_f, Umu_d);
  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << "   Ls: " << Ls << std::endl;
  RealD mass = 0.01;
  RealD M5 = 1.8;
  SpeciesD Ddwf_d(Umu_d, *FGrid_d, *FrbGrid_d, *UGrid_d, *UrbGrid_d, mass, M5, params);
  SpeciesF Ddwf_f(Umu_f, *FGrid_f, *FrbGrid_f, *UGrid_f, *UrbGrid_f, mass, M5, params);
  FermionFieldD src_o_d(FrbGrid_d);
  pickCheckerboard(Odd, src_o_d, src_d);
  SchurDiagMooeeOperator<SpeciesD, FermionFieldD> HermOpEO_d(Ddwf_d);
  SchurDiagMooeeOperator<SpeciesF, FermionFieldF> HermOpEO_f(Ddwf_f);
  AlgRemez remez(1e-4, 64, 50);
  int order = 15;
  remez.generateApprox(order, 1, 2); //sqrt
  MultiShiftFunction shifts(remez, 1e-10, false);
  int relup_freq = 50;
  double t1=usecond();
  ConjugateGradientMultiShiftMixedPrec<FermionFieldD,FermionFieldF> mcg(10000, shifts, FrbGrid_f, HermOpEO_f, relup_freq);
  std::vector<FermionFieldD> results_o_d(order, FrbGrid_d);
  mcg(HermOpEO_d, src_o_d, results_o_d);
  double t2=usecond();
  //Crosscheck double and mixed prec results
  ConjugateGradientMultiShift<FermionFieldD> dmcg(10000, shifts);
  std::vector<FermionFieldD> results_o_d_2(order, FrbGrid_d);
  dmcg(HermOpEO_d, src_o_d, results_o_d_2);
  double t3=usecond();
  std::cout << GridLogMessage << "Comparison of mixed prec results to double prec results |mixed - double|^2 :" << std::endl;
  FermionFieldD tmp(FrbGrid_d);
  for(int i=0;i<order;i++){
    RealD ndiff = axpy_norm(tmp, -1., results_o_d[i], results_o_d_2[i]);
    std::cout << i << " " << ndiff << std::endl;
  }
  std::cout<<GridLogMessage << "Mixed precision algorithm: Total usec    =   "<< (t2-t1)<<std::endl;
  std::cout<<GridLogMessage << "Double precision algorithm: Total usec    =   "<< (t3-t2)<<std::endl;
 }
 int main (int argc, char ** argv)
 {
  Grid_init(&argc, &argv);
  bool gparity = false;
  int gpdir;
  for(int i=1;i<argc;i++){
    std::string arg(argv[i]);
    if(arg == "--Gparity"){
      assert(i!=argc-1);
      gpdir = std::stoi(argv[i+1]);
      assert(gpdir >= 0 && gpdir <= 2); //spatial!
      gparity = true;
    }
  }
  if(gparity){
    std::cout << "Running test with G-parity BCs in " << gpdir << " direction" << std::endl;
    GparityWilsonImplParams params;
    params.twists[gpdir] = 1;
    std::vector<int> conj_dirs(Nd,0);
    conj_dirs[gpdir] = 1;
    ConjugateGimplD::setDirections(conj_dirs);
    run_test<GparityDomainWallFermionD, GparityDomainWallFermionF, ConjugateGaugeStatistics>(argc,argv,params);
  }else{
    std::cout << "Running test with periodic BCs" << std::endl;
    WilsonImplParams params;
    run_test<DomainWallFermionD, DomainWallFermionF, PeriodicGaugeStatistics>(argc,argv,params);
  }
  Grid_finalize();
 }