Merge branch 'feature/dirichlet' into develop

2026-05-21 01:24:16 +01:00 · 2023-03-29 16:26:47 -04:00
parent a00ae981e0 c42e25e5b8
commit 10e6d7c6ce
358 changed files with 24926 additions and 4366 deletions
@@ -44,9 +44,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridStd.h>
 #include <Grid/threads/Pragmas.h>
 #include <Grid/perfmon/Timer.h>
-#include <Grid/perfmon/PerfCount.h>
+//#include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
 #include <Grid/perfmon/Tracing.h>
 #include <Grid/allocator/Allocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/threads/ThreadReduction.h>
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
@@ -324,9 +324,9 @@ public:
  GridBase*        _cbgrid;
  int hermitian;
-  CartesianStencil<siteVector,siteVector,int> Stencil; 
+  CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil; 
-  CartesianStencil<siteVector,siteVector,int> StencilEven;
+  CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven;
-  CartesianStencil<siteVector,siteVector,int> StencilOdd;
+  CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd;
  std::vector<CoarseMatrix> A;
  std::vector<CoarseMatrix> Aeven;
@@ -631,7 +631,7 @@ public:
    assert(Aself != nullptr);
  }
-  void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
+  void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
                       const CoarseVector &in, CoarseVector &out, int dag) {
    int point = geom.npoint-1;
    autoView( out_v, out, AcceleratorWrite);
@@ -694,7 +694,7 @@ public:
    }
  }
-  void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
+  void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a,
                    const CoarseVector &in, CoarseVector &out, int dag) {
    SimpleCompressor<siteVector> compressor;
@@ -784,9 +784,9 @@ public:
    _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
-    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements),
    A(geom.npoint,&CoarseGrid),
    Aeven(geom.npoint,_cbgrid),
    Aodd(geom.npoint,_cbgrid),
@@ -804,9 +804,9 @@ public:
    _cbgrid(&CoarseRBGrid),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
-    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements),
    A(geom.npoint,&CoarseGrid),
    Aeven(geom.npoint,&CoarseRBGrid),
    Aodd(geom.npoint,&CoarseRBGrid),
@@ -526,6 +526,7 @@ public:
      (*this)(Linop,in[k],out[k]);
    }
  };
  virtual ~OperatorFunction(){};
 };
 template<class Field> class LinearFunction {
@@ -258,26 +258,12 @@ public:
    for(int n=2;n<order;n++){
      Linop.HermOp(*Tn,y);
 #if 0
      auto y_v = y.View();
      auto Tn_v = Tn->View();
      auto Tnp_v = Tnp->View();
      auto Tnm_v = Tnm->View();
      constexpr int Nsimd = vector_type::Nsimd();
      accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
      });
      if ( Coeffs[n] != 0.0) {
 	axpy(out,Coeffs[n],*Tnp,out);
      }
 #else
      axpby(y,xscale,mscale,y,(*Tn));
      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
      if ( Coeffs[n] != 0.0) {
 	axpy(out,Coeffs[n],*Tnp,out);
      }
-#endif
+
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
@@ -58,6 +58,7 @@ public:
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
    GRID_TRACE("ConjugateGradient");
    psi.Checkerboard() = src.Checkerboard();
    conformable(psi, src);
@@ -117,9 +118,13 @@ public:
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    RealD usecs = -usecond();
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      GridStopWatch IterationTimer;
      IterationTimer.Start();
      c = cp;
      MatrixTimer.Start();
@@ -152,31 +157,41 @@ public:
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
-      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
+      IterationTimer.Stop();
      if ( (k % 500) == 0 ) {
 	std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
      } else { 
 	std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
 		  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
      }
      // Stopping condition
      if (cp <= rsq) {
 	usecs +=usecond();
        SolverTimer.Stop();
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
-
+	GridBase *grid = src.Grid();
 	RealD DwfFlops = (1452. )*grid->gSites()*4*k
   	               + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
        RealD srcnorm = std::sqrt(norm2(src));
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k 
 		  << "\tComputed residual " << std::sqrt(cp / ssq)
 		  << "\tTrue residual " << true_residual
 		  << "\tTarget " << Tolerance << std::endl;
-        std::cout << GridLogIterative << "Time breakdown "<<std::endl;
+        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
-	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -49,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@@ -68,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
    }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
    GridStopWatch TotalTimer;
@@ -97,6 +99,7 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
@@ -105,7 +108,10 @@ NAMESPACE_BEGIN(Grid);
    GridStopWatch PrecChangeTimer;
    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
-      
+
    precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
@@ -120,7 +126,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
      PrecChangeTimer.Stop();
      sol_f = Zero();
@@ -130,6 +136,7 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);
      //Inner CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@@ -138,7 +145,7 @@ NAMESPACE_BEGIN(Grid);
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
      PrecChangeTimer.Stop();
      axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -150,6 +157,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
@@ -44,7 +44,7 @@ public:
  using OperatorFunction<Field>::operator();
-  RealD   Tolerance;
+  //  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -84,6 +84,7 @@ public:
  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
  {
    GRID_TRACE("ConjugateGradientMultiShift");
    GridBase *grid = src.Grid();
@@ -182,6 +183,9 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
@@ -321,8 +325,8 @@ public:
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tAXPY     " << AXPYTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix   " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
      IterationsToComplete = k;	
@@ -0,0 +1,373 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety the a final regular CG is applied to clean up if necessary
 //PB Pure single, then double fixup
 template<class FieldD, class FieldF,
 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
 					     public OperatorFunction<FieldD>
 {
 public:                                                
  using OperatorFunction<FieldD>::operator();
  RealD   Tolerance;
  Integer MaxIterationsMshift;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision
  ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq) : 
    MaxIterationsMshift(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
    MaxIterations(20000)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;
    (*this)(Linop,src,results);
    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }
    return;
  }
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
    GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
    GridBase *DoublePrecGrid = src_d.Grid();
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;
    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);
    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
    std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldF r_f(SinglePrecGrid);
    FieldD mmp_d(DoublePrecGrid);
    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  rsqf[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
    const int       primary =0;
    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev
    // Matrix mult fields
    FieldF p_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    // Check lightest mass
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }
    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi_d[s] = Zero();
 	psi_f[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      rsqf[s] =rsq[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      //      ps_d[s] = src_d;
      precisionChangeFast(ps_f[s],src_d);
    }
    // r and p for primary
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    r_d = p_d;
    //MdagM+m[0]
    precisionChangeFast(p_f,p_d);
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    precisionChangeFast(tmp_d,mmp_f);
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
    //    assert(norm2(tmp_d)< 1.0e-4);
    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
    d += rn*mass[0];
    b = -cp /d;
    // Set up the various shift variables
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz]; 
    }
    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_d,b,mmp_d,r_d);
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
      precisionChangeFast(psi_f[s],psi_d[s]);
    }
    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
    SolverTimer.Start();
    // Iteration loop
    int k;
    for (k=1;k<=MaxIterationsMshift;k++){    
      a = c /cp;
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 
      AXPYTimer.Stop();
      PrecChangeTimer.Start();
      precisionChangeFast(r_f, r_d);
      PrecChangeTimer.Stop();
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
 	    axpy(ps_f[s],a,ps_f[s],r_f);
 	  } else{
 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	    axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
 	  }
 	}
      }
      AXPYTimer.Stop();
      cp=c;
      PrecChangeTimer.Start();
      precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f);
      MatrixTimer.Stop();  
      PrecChangeTimer.Start();
      precisionChangeFast(mmp_d, mmp_f); // From Float to Double
      PrecChangeTimer.Stop();
      d=real(innerProduct(p_d,mmp_d));    
      axpy(mmp_d,mass[0],p_d,mmp_d);
      RealD rn = norm2(p_d);
      d += rn*mass[0];
      bp=b;
      b=-cp/d;
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
 	  RealD z1 = z[s][iz];
 	  z[s][iz] = z0*z1*bp
 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
      }
      ShiftTimer.Stop();
      //Update single precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	int ss = s;
 	if( (!converged[s]) ) { 
 	  axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
 	}
      }
      c = axpy_norm(r_d,b,mmp_d,r_d);
      AXPYTimer.Stop();
      // Convergence checks
      int all_converged = 1;
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	  if(css<rsqf[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
 	  } else {
 	    all_converged=0;
 	  }
 	}
      }
      if ( all_converged || k == MaxIterationsMshift-1){
 	SolverTimer.Stop();
 	for(int s=0;s<nshift;s++){
 	  precisionChangeFast(psi_d[s],psi_f[s]);
 	}
 	if ( all_converged ){
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
 	}
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
 	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
 	  axpy(r_d,-alpha[s],src_d,tmp_d);
 	  RealD rn = norm2(r_d);
 	  RealD cn = norm2(src_d);
 	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
 	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
 	  if(rn >= rsq[s]){
 	    CleanupTimer.Start();
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
 	    //Setup linear operators for final cleanup
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
 	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
 	    cg(src_d, psi_d[s]);
 	    TrueResidualShift[s] = cg.TrueResidual;
 	    CleanupTimer.Stop();
 	  }
 	}
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
 	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 	IterationsToComplete = k;	
 	return;
      }
    }
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    assert(0);
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,416 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety the a final regular CG is applied to clean up if necessary
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>{
 public:
  LinearOperatorBase<Field> &linop_base;
  RealD shift;
  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void HermOp(const Field &in, Field &out){
    linop_base.HermOp(in, out);
    axpy(out, shift, in, out);
  }    
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
 };
 };
 template<class FieldD, class FieldF,
 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
 					     public OperatorFunction<FieldD>
 {
 public:                                                
  using OperatorFunction<FieldD>::operator();
  RealD   Tolerance;
  Integer MaxIterationsMshift;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision
  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq) : 
    MaxIterationsMshift(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
    MaxIterations(20000)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;
    (*this)(Linop,src,results);
    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }
    return;
  }
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
    GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
    GridBase *DoublePrecGrid = src_d.Grid();
    precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
    precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;
    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);
    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldD mmp_d(DoublePrecGrid);
    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  rsqf[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
    const int       primary =0;
    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev
    // Matrix mult fields
    FieldF p_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    // Check lightest mass
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }
    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi_d[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      rsqf[s] =rsq[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      ps_d[s] = src_d;
    }
    // r and p for primary
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    r_d = p_d;
    //MdagM+m[0]
    precisionChange(p_f, p_d, pc_wk_d_to_s);
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
    //    assert(norm2(tmp_d)< 1.0e-4);
    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
    d += rn*mass[0];
    b = -cp /d;
    // Set up the various shift variables
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz]; 
    }
    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_d,b,mmp_d,r_d);
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
    }
    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
    SolverTimer.Start();
    // Iteration loop
    int k;
    for (k=1;k<=MaxIterationsMshift;k++){    
      a = c /cp;
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 
      for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
 	    axpy(ps_d[s],a,ps_d[s],r_d);
 	  } else{
 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
 	  }
 	}
      }
      AXPYTimer.Stop();
      PrecChangeTimer.Start();
      precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      cp=c;
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f);
      MatrixTimer.Stop();  
      PrecChangeTimer.Start();
      precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
      PrecChangeTimer.Stop();
      AXPYTimer.Start();
      d=real(innerProduct(p_d,mmp_d));    
      axpy(mmp_d,mass[0],p_d,mmp_d);
      AXPYTimer.Stop();
      RealD rn = norm2(p_d);
      d += rn*mass[0];
      bp=b;
      b=-cp/d;
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
 	  RealD z1 = z[s][iz];
 	  z[s][iz] = z0*z1*bp
 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
      }
      ShiftTimer.Stop();
      //Update double precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	int ss = s;
 	if( (!converged[s]) ) { 
 	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
 	}
      }
      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
      c = axpy_norm(r_d,b,mmp_d,r_d);
      AXPYTimer.Stop();
      if(k % ReliableUpdateFreq == 0){
 	RealD c_old = c;
 	//Replace r with true residual
 	MatrixTimer.Start();  
 	Linop_d.HermOp(psi_d[0],mmp_d); 
 	MatrixTimer.Stop();  
 	AXPYTimer.Start();
 	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
 	c = axpy_norm(r_d, -1.0, mmp_d, src_d);
 	AXPYTimer.Stop();
 	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
      }
      // Convergence checks
      int all_converged = 1;
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	  if(css<rsqf[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
 	  } else {
 	    all_converged=0;
 	  }
 	}
      }
      if ( all_converged || k == MaxIterationsMshift-1){
 	SolverTimer.Stop();
 	if ( all_converged ){
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
 	}
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
 	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
 	  axpy(r_d,-alpha[s],src_d,tmp_d);
 	  RealD rn = norm2(r_d);
 	  RealD cn = norm2(src_d);
 	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
 	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
 	  if(rn >= rsq[s]){
 	    CleanupTimer.Start();
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
 	    //Setup linear operators for final cleanup
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
 	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
 	    cg(src_d, psi_d[s]);
 	    TrueResidualShift[s] = cg.TrueResidual;
 	    CleanupTimer.Stop();
 	  }
 	}
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
 	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 	IterationsToComplete = k;	
 	return;
      }
    }
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    assert(0);
  }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -48,7 +48,7 @@ public:
  LinearOperatorBase<FieldF> &Linop_f;
  LinearOperatorBase<FieldD> &Linop_d;
  GridBase* SinglePrecGrid;
-  RealD Delta; //reliable update parameter
+  RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
  //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
  LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,7 +65,9 @@ public:
      ErrorOnNoConverge(err_on_no_conv),
      DoFinalCleanup(true),
      Linop_fallback(NULL)
-  {};
+  {
    assert(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
  };
  void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
    Linop_fallback = &_Linop_fallback;
@@ -73,6 +75,7 @@ public:
  }
  void operator()(const FieldD &src, FieldD &psi) {
    GRID_TRACE("ConjugateGradientReliableUpdate");
    LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
    bool using_fallback = false;
@@ -115,9 +118,12 @@ public:
    }
    //Single prec initialization
    precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
    FieldF r_f(SinglePrecGrid);
    r_f.Checkerboard() = r.Checkerboard();
-    precisionChange(r_f, r);
+    precisionChange(r_f, r, pc_wk_dp_to_sp);
    FieldF psi_f(r_f);
    psi_f = Zero();
@@ -133,7 +139,8 @@ public:
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
-
+    GridStopWatch PrecChangeTimer;
    SolverTimer.Start();
    int k = 0;
    int l = 0;
@@ -172,7 +179,9 @@ public:
      // Stopping condition
      if (cp <= rsq) {
 	//Although not written in the paper, I assume that I have to add on the final solution
-	precisionChange(mmp, psi_f);
+	PrecChangeTimer.Start();
 	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
 	PrecChangeTimer.Stop();
 	psi = psi + mmp;
@@ -193,7 +202,10 @@ public:
 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
 	IterationsToComplete = k;	
 	ReliableUpdatesPerformed = l;
@@ -213,14 +225,21 @@ public:
      else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
-	precisionChange(mmp, psi_f);
+	PrecChangeTimer.Start();
 	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
 	PrecChangeTimer.Stop();
 	psi = psi + mmp;
 	MatrixTimer.Start();
 	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
 	MatrixTimer.Stop();
 	r = src - mmp;
 	psi_f = Zero();
-	precisionChange(r_f, r);
+	PrecChangeTimer.Start();
 	precisionChange(r_f, r, pc_wk_dp_to_sp);
 	PrecChangeTimer.Stop();
 	cp = norm2(r);
 	MaxResidSinceLastRelUp = cp;
@@ -44,6 +44,7 @@ public:
 				  int, MinRes);    // Must restart
 };
 //This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -145,16 +146,24 @@ public:
  LinearOperatorBase<FineField> &_Linop;
  RealD                             _coarse_relax_tol;
  std::vector<FineField>        &_subspace;
  int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
                                //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
                                //To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
                                //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
                                //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
 					   std::vector<FineField>        &subspace,
-					   RealD coarse_relax_tol=5.0e3) 
+					   RealD coarse_relax_tol=5.0e3,
 					   int largestEvalIdxForReport=-1) 
    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
-      _coarse_relax_tol(coarse_relax_tol)  
+      _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
  {    };
  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@@ -177,12 +186,26 @@ public:
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
    if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
      std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
      RealD tmp_eval;
      ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
    }
    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
-  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
+
  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
  //applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -201,13 +224,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
 	     <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -285,6 +308,10 @@ public:
    evals_coarse.resize(0);
  };
  //The block inner product is the inner product on the fine grid locally summed over the blocks
  //to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
  //vectors under the block inner product. This step must be performed after computing the fine grid
  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -328,6 +355,8 @@ public:
    }
  }
  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@@ -376,25 +405,31 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op);
+    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1); 
    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
    CoarseField src(_CoarseGrid);     src=1.0; 
    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -405,6 +440,14 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
  //Get the fine eigenvector 'i' by reconstruction
  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
    blockPromote(evec_coarse[i],evec,subspace);  
    eval = evals_coarse[i];
  }
 };
 NAMESPACE_END(Grid);
@@ -29,6 +29,8 @@ template<class Field> class PowerMethod
      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
@@ -144,8 +144,8 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
-  assert(AccCache.accLock==0); // Cannot evict so logic bomb
+  if (AccCache.accLock!=0) return;
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  if (AccCache.cpuLock!=0) return;
  if(AccCache.state==AccDirty) {
    Flush(AccCache);
  }
@@ -532,6 +532,7 @@ void MemoryManager::Audit(std::string s)
    assert(AccCache.LRU_entry==it);
  }
  std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
    auto &AccCache = it->second;
@@ -548,6 +549,7 @@ void MemoryManager::Audit(std::string s)
    if ( AccCache.cpuLock || AccCache.accLock ) {
      assert(AccCache.LRU_valid==0);
      std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 		<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 		<< "\t cpuLock  " << AccCache.cpuLock
@@ -566,6 +568,7 @@ void MemoryManager::Audit(std::string s)
  std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
  assert(LruCnt == LRU.size());
  std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
 }
 void MemoryManager::PrintState(void* _CpuPtr)
@@ -53,10 +53,11 @@ public:
  // Communicator should know nothing of the physics grid, only processor grid.
  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  Coordinate _processor_coor;  // linear processor coordinate
  unsigned long    _ndimension;
  Coordinate _shm_processors;  // Which dimensions get relayed out over processors lanes.
  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
  Coordinate _processor_coor;  // linear processor coordinate
  static Grid_MPI_Comm      communicator_world;
  Grid_MPI_Comm             communicator;
  std::vector<Grid_MPI_Comm> communicator_halo;
@@ -97,14 +98,16 @@ public:
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const Coordinate & ThisProcessorCoor(void) ;
  const Coordinate & ShmGrid(void)  { return _shm_processors; }  ;
  const Coordinate & ProcessorGrid(void)     ;
-  int                      ProcessorCount(void)    ;
+  int                ProcessorCount(void)    ;
  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
  static void BarrierWorld(void);
  ////////////////////////////////////////////////////////////
  // Reduction
@@ -128,7 +131,7 @@ public:
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
-    scalar_type * ptr = (scalar_type *)& o;
+    scalar_type * ptr = (scalar_type *)& o; // Safe alias 
    GlobalSumVector(ptr,words);
  }
@@ -142,17 +145,17 @@ public:
 		      int bytes);
  double StencilSendToRecvFrom(void *xmit,
-			       int xmit_to_rank,
+			       int xmit_to_rank,int do_xmit,
 			       void *recv,
-			       int recv_from_rank,
+			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
-				    int xmit_to_rank,
+				    int xmit_to_rank,int do_xmit,
 				    void *recv,
-				    int recv_from_rank,
+				    int recv_from_rank,int do_recv,
-				    int bytes,int dir);
+				    int xbytes,int rbytes,int dir);
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
  // Remap using the shared memory optimising routine
  // The remap creates a comm which must be freed
  ////////////////////////////////////////////////////
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm,_shm_processors);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
  ///////////////////////////////////////////////////
@@ -124,12 +124,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  Coordinate parent_processor_coor(_ndimension,0);
  Coordinate parent_processors    (_ndimension,1);
-
+  Coordinate shm_processors       (_ndimension,1);
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
    shm_processors       [pad+d]=parent._shm_processors[d];
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -154,6 +155,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
    if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
  }
  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@@ -335,23 +337,23 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int dest,
+						     int dest, int dox,
 						     void *recv,
-						     int from,
+						     int from, int dor,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int dest,
+							 int dest,int dox,
 							 void *recv,
-							 int from,
+							 int from,int dor,
-							 int bytes,int dir)
+							 int xbytes,int rbytes,int dir)
 {
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;
@@ -370,37 +372,34 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;
-  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+  if ( dor ) {
-    tag= dir+from*32;
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      tag= dir+from*32;
-    assert(ierr==0);
+      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-    list.push_back(rrq);
+      assert(ierr==0);
-    off_node_bytes+=bytes;
+      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
  }
-
+  
-  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+  if (dox) {
-    tag= dir+_processor*32;
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      tag= dir+_processor*32;
-    assert(ierr==0);
+      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-    list.push_back(xrq);
+      assert(ierr==0);
-    off_node_bytes+=bytes;
+      list.push_back(xrq);
-  } else {
+      off_node_bytes+=xbytes;
-    // TODO : make a OMP loop on CPU, call threaded bcopy
+    } else {
-    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-    assert(shm!=NULL);
+      assert(shm!=NULL);
-    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+    }
  }
  //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
  //    this->StencilSendToRecvFromComplete(list,dir);
  //  }
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  //   std::cout << "Copy Synchronised\n"<<std::endl;
  int nreq=list.size();
  if (nreq==0) return;
@@ -436,6 +435,10 @@ int CartesianCommunicator::RankWorld(void){
  MPI_Comm_rank(communicator_world,&r);
  return r;
 }
 void CartesianCommunicator::BarrierWorld(void){
  int ierr = MPI_Barrier(communicator_world);
  assert(ierr==0);
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
@@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) 
  : CartesianCommunicator(processors) 
 {
  _shm_processors = Coordinate(processors.size(),1);
  srank=0;
  SetCommunicator(communicator_world);
 }
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
  _shm_processors = Coordinate(processors.size(),1);
  _processors = processors;
  _ndimension = processors.size();  assert(_ndimension>=1);
  _processor_coor.resize(_ndimension);
@@ -102,6 +104,7 @@ int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
 void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
@@ -111,19 +114,19 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 }
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int xmit_to_rank,
+						     int xmit_to_rank,int dox,
 						     void *recv,
-						     int recv_from_rank,
+						     int recv_from_rank,int dor,
 						     int bytes, int dir)
 {
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int xmit_to_rank,
+							 int xmit_to_rank,int dox,
 							 void *recv,
-							 int recv_from_rank,
+							 int recv_from_rank,int dor,
-							 int bytes, int dir)
+							 int xbytes,int rbytes, int dir)
 {
  return 2.0*bytes;
 }
@@ -93,9 +93,10 @@ public:
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
-  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
@@ -29,6 +29,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <Grid/GridCore.h>
 #include <pwd.h>
 #include <syscall.h>
 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
@@ -153,7 +154,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
  }
  return log2size;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  //////////////////////////////////////////////////////////////////////////////
  // Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -166,8 +167,8 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
+  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
-  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
+  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
 }
 static inline int divides(int a,int b)
 {
@@ -222,7 +223,7 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
    dim=(dim+1) %ndimension;
  }
 }
-void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
@@ -295,7 +296,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  Coordinate HyperCoor(ndimension);
  GetShmDims(WorldDims,ShmDims);
-
+  SHM = ShmDims;
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -342,7 +344,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 }
-void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
@@ -354,6 +356,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
  Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);
  GetShmDims(WorldDims,ShmDims);
  SHM=ShmDims;
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -521,7 +525,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  }
  if ( WorldRank == 0 ){
    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
-	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
  std::cout<< "Setting up IPC"<<std::endl;
@@ -48,9 +48,10 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  _ShmSetup=1;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  optimal_comm = WorldComm;
  SHM = Coordinate(processors.size(),1);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
@@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
-  typedef typename vobj::scalar_type scalar_type;
+  //  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  const int Nsimd = vobj::vector_type::Nsimd();
@@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
@@ -53,6 +54,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -70,6 +72,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -86,6 +89,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -106,6 +110,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -119,6 +124,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -133,6 +139,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -146,6 +153,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -163,6 +171,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -177,6 +186,7 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -191,6 +201,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -204,6 +215,7 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -218,6 +230,7 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 template<class sobj,class vobj> inline
 void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
  GRID_TRACE("axpy");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@@ -231,6 +244,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
 }
 template<class sobj,class vobj> inline
 void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
  GRID_TRACE("axpby");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@@ -246,11 +260,13 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpy_norm");
    return axpy_norm_fast(ret,a,x,y);
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpby_norm");
    return axpby_norm_fast(ret,a,b,x,y);
 }
@@ -117,6 +117,7 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -140,6 +141,7 @@ public:
  }
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -163,6 +165,7 @@ public:
  }
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -32,7 +32,6 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -82,7 +81,6 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -130,7 +128,6 @@ template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid  = lhs.Grid();
@@ -96,9 +96,6 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
  GridBase *grid=l.Grid();
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@@ -125,14 +122,17 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj>
 typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
  typename vobj::scalar_object s;
  peekSite(s,l,site);
  return s;
 }        
 template<class vobj,class sobj>
 void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
  GridBase *grid=l.Grid();
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@@ -173,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
-  scalar_type * vp = (scalar_type *)&l[odx];
+  const vector_type *vp = (const vector_type *) &l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
-    pt[w] = vp[idx+w*Nsimd];
+    pt[w] = getlane(vp[w],idx);
  }
  return;
@@ -210,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
-  scalar_type * vp = (scalar_type *)&l[odx];
+  vector_type * vp = (vector_type *)&l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
-    vp[idx+w*Nsimd] = pt[w];
+    putlane(vp[w],pt[w],idx);
  }
  return;
 };
@@ -94,10 +94,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  for(int i=0;i<nthread;i++){
    ssum = ssum+sumarray[i];
  } 
-  
+  return ssum;
  typedef typename vobj::scalar_object ssobj;
  ssobj ret = ssum;
  return ret;
 }
 /*
 Threaded max, don't use for now
@@ -236,7 +233,6 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 template<class vobj>
 inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  ComplexD  nrm;
@@ -246,6 +242,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  // Might make all code paths go this way.
 #if 0
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
@@ -254,15 +251,31 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
    autoView( right_v,right, AcceleratorRead);
    // This code could read coalesce
    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, 1,{
+    accelerator_for( ss, sites, nsimd,{
-	auto x_l = left_v[ss];
+	auto x_l = left_v(ss);
-	auto y_l = right_v[ss];
+	auto y_l = right_v(ss);
-	inner_tmp_v[ss]=innerProductD(x_l,y_l);
+	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
    });
  }
 #else
  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, nsimd,{
 	auto x_l = left_v(ss);
 	auto y_l = right_v(ss);
 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
    });
  }
 #endif
  // This is in single precision and fails some tests
-  auto anrm = sum(inner_tmp_v,sites);  
+  auto anrm = sumD(inner_tmp_v,sites);  
  nrm = anrm;
  return nrm;
 }
@@ -295,8 +308,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  conformable(z,x);
  conformable(x,y);
-  typedef typename vobj::scalar_type scalar_type;
+  //  typedef typename vobj::vector_typeD vector_type;
  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  GridBase *grid = x.Grid();
@@ -308,17 +320,29 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
-
+#if 0
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
-  accelerator_for( ss, sites, 1,{
+  accelerator_for( ss, sites, nsimd,{
-      auto tmp = a*x_v[ss]+b*y_v[ss];
+      auto tmp = a*x_v(ss)+b*y_v(ss);
-      inner_tmp_v[ss]=innerProductD(tmp,tmp);
+      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
-      z_v[ss]=tmp;
+      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 #else
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      auto tmp = a*x_v(ss)+b*y_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
 #endif
  grid->GlobalSum(nrm);
  return nrm; 
 }
@@ -328,7 +352,6 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 {
  conformable(left,right);
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  Vector<ComplexD> tmp(2);
@@ -472,6 +495,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object> 
 sliceSum(const Lattice<vobj> &Data,int orthogdim)
 {
  std::vector<typename vobj::scalar_object> result;
  sliceSum(Data,result,orthogdim);
  return result;
 }
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
@@ -576,7 +607,8 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
 template<class vobj>
 static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 			    int orthogdim,RealD scale=1.0) 
-{    
+{
  // perhaps easier to just promote A to a field and use regular madd
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -607,8 +639,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
    for(int l=0;l<Nsimd;l++){
      grid->iCoorFromIindex(icoor,l);
      int ldx =r+icoor[orthogdim]*rd;
-      scalar_type *as =(scalar_type *)&av;
+      av.putlane(scalar_type(a[ldx])*zscale,l);
      as[l] = scalar_type(a[ldx])*zscale;
    }
    tensor_reduced at; at=av;
@@ -648,7 +679,6 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -702,7 +732,6 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -756,7 +785,6 @@ template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid  = lhs.Grid();
@@ -211,13 +211,25 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  assert(ok);
  Integer smemSize = numThreads * sizeof(sobj);
-
+  // Move out of UVM
  // Turns out I had messed up the synchronise after move to compute stream
  // as running this on the default stream fools the synchronise
 #undef UVM_BLOCK_BUFFER  
 #ifndef UVM_BLOCK_BUFFER  
  commVector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
 #else
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
-  
+  sobj result;
-  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
+  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
-  auto result = buffer_v[0];
+  result = *buffer_v;
 #endif
  return result;
 }
@@ -250,8 +262,6 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
@@ -424,9 +424,33 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
 #if 1
    thread_for( lidx, _grid->lSites(), {
 	int gidx;
 	int o_idx;
 	int i_idx;
 	int rank;
 	Coordinate pcoor;
 	Coordinate lcoor;
 	Coordinate gcoor;
 	_grid->LocalIndexToLocalCoor(lidx,lcoor);
 	pcoor=_grid->ThisProcessorCoor();
 	_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 	assert(rank == _grid->ThisRank() );
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
 	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
    });
 #else
    // Everybody loops over global volume.
    thread_for( gidx, _grid->_gsites, {
 	// Where is it?
 	int rank;
 	int o_idx;
@@ -443,6 +467,7 @@ public:
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
    });
 #endif
 #else 
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
@@ -194,11 +194,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
 #endif
 accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
-  out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
+  precisionChange(out,in);
 }
 accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
-  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
+  precisionChange(out,in);
 }
 template<typename T1,typename T2>
@@ -726,10 +726,10 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
-      scalar_type * fp = (scalar_type *)&f_v[odx_f];
+      vector_type * fp = (vector_type *)&f_v[odx_f];
-      scalar_type * tp = (scalar_type *)&t_v[odx_t];
+      vector_type * tp = (vector_type *)&t_v[odx_t];
      for(int w=0;w<words;w++){
-	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME IF RRII layout, type pun no worke
+	tp[w].putlane(fp[w].getlane(idx_f),idx_t);
      }
    }
  });
@@ -904,7 +904,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
@@ -1129,9 +1129,27 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }
-//Convert a Lattice from one precision to another
+//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
 template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 {
  typedef typename VobjOut::vector_type Vout;
  typedef typename VobjIn::vector_type Vin;
  const int N = sizeof(VobjOut)/sizeof(Vout);
  conformable(out.Grid(),in.Grid());
  out.Checkerboard() = in.Checkerboard();
  int nsimd = out.Grid()->Nsimd();
  autoView( out_v  , out, AcceleratorWrite);
  autoView(  in_v ,   in, AcceleratorRead);
  accelerator_for(idx,out.Grid()->oSites(),1,{
      Vout *vout = (Vout *)&out_v[idx];
      Vin  *vin  = (Vin  *)&in_v[idx];
      precisionChange(vout,vin,N);
  });
 }
 //Convert a Lattice from one precision to another (original, slow implementation)
 template<class VobjOut, class VobjIn>
 void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 {
  assert(out.Grid()->Nd() == in.Grid()->Nd());
  for(int d=0;d<out.Grid()->Nd();d++){
@@ -1146,7 +1164,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
  int ndim = out.Grid()->Nd();
  int out_nsimd = out_grid->Nsimd();
-    
+  int in_nsimd = in_grid->Nsimd();
  std::vector<Coordinate > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
@@ -1177,6 +1195,128 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
  });
 }
 //The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
 class precisionChangeWorkspace{
  std::pair<Integer,Integer>* fmap_device; //device pointer
  //maintain grids for checking
  GridBase* _out_grid;
  GridBase* _in_grid;
 public:
  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
    assert(out_grid->Nd() == in_grid->Nd());
    for(int d=0;d<out_grid->Nd();d++){
      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
    }
    int Nsimd_out = out_grid->Nsimd();
    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
    for(int lane=0; lane < out_grid->Nsimd(); lane++)
      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
    thread_for(out_oidx,out_grid->oSites(),{
 	Coordinate out_ocorr; 
 	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
 	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
 	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
 	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
 	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
 	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
 	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
 	  int in_oidx = 0, in_lane = 0;
 	  for(int d=0;d<in_grid->_ndimension;d++){
 	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
 	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
 	  }
 	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
 	}
      });
    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
  }
  //Prevent moving or copying
  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
  void checkGrids(GridBase* out, GridBase* in) const{
    conformable(out, _out_grid);
    conformable(in, _in_grid);
  }
  ~precisionChangeWorkspace(){
    acceleratorFreeDevice(fmap_device);
  }
 };
 //We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
 //*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
 template<class VobjOut, class VobjIn>
 auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
  if(out.Grid() == in.Grid()){
    precisionChangeFast(out,in);
    return 1;
  }else{
    return 0;
  }
 }
 template<class VobjOut, class VobjIn>
 int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
  return 0;
 }
 //Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
 //which contains the mapping data.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
  if(_precisionChangeFastWrap(out,in,0)) return;
  static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
  out.Checkerboard() = in.Checkerboard();
  constexpr int Nsimd_out = VobjOut::Nsimd();
  workspace.checkGrids(out.Grid(),in.Grid());
  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
  //Do the copy/precision change
  autoView( out_v , out, AcceleratorWrite);
  autoView( in_v , in, AcceleratorRead);
  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
 	int in_oidx = fmap_osite[out_lane].first;
 	int in_lane = fmap_osite[out_lane].second;
 	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
      }
    });
 }
 //Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
 //or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  if(_precisionChangeFastWrap(out,in,0)) return;   
  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
  precisionChange(out, in, workspace);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
@@ -42,9 +42,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@@ -203,7 +205,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -110,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -121,6 +126,7 @@ typedef iSpinMatrix<ComplexD >          SpinMatrixD;
 typedef iSpinMatrix<vComplex >          vSpinMatrix;
 typedef iSpinMatrix<vComplexF>          vSpinMatrixF;
 typedef iSpinMatrix<vComplexD>          vSpinMatrixD;
 typedef iSpinMatrix<vComplexD2>         vSpinMatrixD2;
 // Colour Matrix
 typedef iColourMatrix<Complex  >        ColourMatrix;
@@ -130,6 +136,7 @@ typedef iColourMatrix<ComplexD >        ColourMatrixD;
 typedef iColourMatrix<vComplex >        vColourMatrix;
 typedef iColourMatrix<vComplexF>        vColourMatrixF;
 typedef iColourMatrix<vComplexD>        vColourMatrixD;
 typedef iColourMatrix<vComplexD2>       vColourMatrixD2;
 // SpinColour matrix
 typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
@@ -139,6 +146,7 @@ typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
 typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
 typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
 typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
 typedef iSpinColourMatrix<vComplexD2>   vSpinColourMatrixD2;
 // SpinColourSpinColour matrix
 typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
@@ -148,6 +156,7 @@ typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplexD2>   vSpinColourSpinColourMatrixD2;
 // SpinColourSpinColour matrix
 typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
@@ -157,24 +166,38 @@ typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplexD2>   vSpinColourSpinColourMatrixD2;
 // LorentzColour
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
 typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
 typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
-typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
+typedef iLorentzColourMatrix<vComplex >  vLorentzColourMatrix;
-typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
+typedef iLorentzColourMatrix<vComplexF>  vLorentzColourMatrixF;
-typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
+typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD;
 typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
 // DoubleStored gauge field
 typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
-typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
+typedef iDoubleStoredColourMatrix<vComplex >  vDoubleStoredColourMatrix;
-typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
+typedef iDoubleStoredColourMatrix<vComplexF>  vDoubleStoredColourMatrixF;
-typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
+typedef iDoubleStoredColourMatrix<vComplexD>  vDoubleStoredColourMatrixD;
 typedef iDoubleStoredColourMatrix<vComplexD2> vDoubleStoredColourMatrixD2;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex>   vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF>  vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD>  vGparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplexD2> vGparityFlavourMatrixD2;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
@@ -184,6 +207,7 @@ typedef iSpinVector<ComplexD>           SpinVectorD;
 typedef iSpinVector<vComplex >           vSpinVector;
 typedef iSpinVector<vComplexF>           vSpinVectorF;
 typedef iSpinVector<vComplexD>           vSpinVectorD;
 typedef iSpinVector<vComplexD2>          vSpinVectorD2;
 // Colour vector
 typedef iColourVector<Complex >         ColourVector;
@@ -193,6 +217,7 @@ typedef iColourVector<ComplexD>         ColourVectorD;
 typedef iColourVector<vComplex >         vColourVector;
 typedef iColourVector<vComplexF>         vColourVectorF;
 typedef iColourVector<vComplexD>         vColourVectorD;
 typedef iColourVector<vComplexD2>        vColourVectorD2;
 // SpinColourVector
 typedef iSpinColourVector<Complex >     SpinColourVector;
@@ -202,6 +227,7 @@ typedef iSpinColourVector<ComplexD>     SpinColourVectorD;
 typedef iSpinColourVector<vComplex >     vSpinColourVector;
 typedef iSpinColourVector<vComplexF>     vSpinColourVectorF;
 typedef iSpinColourVector<vComplexD>     vSpinColourVectorD;
 typedef iSpinColourVector<vComplexD2>    vSpinColourVectorD2;
 // HalfSpin vector
 typedef iHalfSpinVector<Complex >       HalfSpinVector;
@@ -211,15 +237,27 @@ typedef iHalfSpinVector<ComplexD>       HalfSpinVectorD;
 typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
 typedef iHalfSpinVector<vComplexF>       vHalfSpinVectorF;
 typedef iHalfSpinVector<vComplexD>       vHalfSpinVectorD;
 typedef iHalfSpinVector<vComplexD2>      vHalfSpinVectorD2;
 // HalfSpinColour vector
 typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
 typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
 typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
-typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
+typedef iHalfSpinColourVector<vComplex >  vHalfSpinColourVector;
-typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
+typedef iHalfSpinColourVector<vComplexF>  vHalfSpinColourVectorF;
-typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
+typedef iHalfSpinColourVector<vComplexD>  vHalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplexD2> vHalfSpinColourVectorD2;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplexD2>        vGparityFlavourVectorD2;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@@ -229,6 +267,7 @@ typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tenso
 typedef iSinglet<vComplex >        vTComplex ;   // what if we don't know the tensor structure
 typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
 typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure
 typedef iSinglet<vComplexD2>       vTComplexD2;   // what if we don't know the tensor structure
 typedef iSinglet<Real >            TReal;        // Shouldn't need these; can I make it work without?
 typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
@@ -246,47 +285,58 @@ typedef iSinglet<Integer >         TInteger;
 typedef Lattice<vColourMatrix>          LatticeColourMatrix;
 typedef Lattice<vColourMatrixF>         LatticeColourMatrixF;
 typedef Lattice<vColourMatrixD>         LatticeColourMatrixD;
 typedef Lattice<vColourMatrixD2>        LatticeColourMatrixD2;
 typedef Lattice<vSpinMatrix>            LatticeSpinMatrix;
 typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
 typedef Lattice<vSpinMatrixD>           LatticeSpinMatrixD;
 typedef Lattice<vSpinMatrixD2>          LatticeSpinMatrixD2;
 typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
 typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
 typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;
 typedef Lattice<vSpinColourMatrixD2>    LatticeSpinColourMatrixD2;
 typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
 typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
 typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;
 typedef Lattice<vSpinColourSpinColourMatrixD2>    LatticeSpinColourSpinColourMatrixD2;
-typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
+typedef Lattice<vLorentzColourMatrix>   LatticeLorentzColourMatrix;
-typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
+typedef Lattice<vLorentzColourMatrixF>  LatticeLorentzColourMatrixF;
-typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
+typedef Lattice<vLorentzColourMatrixD>  LatticeLorentzColourMatrixD;
 typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
 // DoubleStored gauge field
-typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
+typedef Lattice<vDoubleStoredColourMatrix>   LatticeDoubleStoredColourMatrix;
-typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
+typedef Lattice<vDoubleStoredColourMatrixF>  LatticeDoubleStoredColourMatrixF;
-typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
+typedef Lattice<vDoubleStoredColourMatrixD>  LatticeDoubleStoredColourMatrixD;
 typedef Lattice<vDoubleStoredColourMatrixD2> LatticeDoubleStoredColourMatrixD2;
 typedef Lattice<vSpinVector>            LatticeSpinVector;
 typedef Lattice<vSpinVectorF>           LatticeSpinVectorF;
 typedef Lattice<vSpinVectorD>           LatticeSpinVectorD;
 typedef Lattice<vSpinVectorD2>          LatticeSpinVectorD2;
 typedef Lattice<vColourVector>          LatticeColourVector;
 typedef Lattice<vColourVectorF>         LatticeColourVectorF;
 typedef Lattice<vColourVectorD>         LatticeColourVectorD;
 typedef Lattice<vColourVectorD2>        LatticeColourVectorD2;
 typedef Lattice<vSpinColourVector>      LatticeSpinColourVector;
 typedef Lattice<vSpinColourVectorF>     LatticeSpinColourVectorF;
 typedef Lattice<vSpinColourVectorD>     LatticeSpinColourVectorD;
 typedef Lattice<vSpinColourVectorD2>    LatticeSpinColourVectorD2;
 typedef Lattice<vHalfSpinVector>        LatticeHalfSpinVector;
 typedef Lattice<vHalfSpinVectorF>       LatticeHalfSpinVectorF;
 typedef Lattice<vHalfSpinVectorD>       LatticeHalfSpinVectorD;
 typedef Lattice<vHalfSpinVectorD2>      LatticeHalfSpinVectorD2;
-typedef Lattice<vHalfSpinColourVector>  LatticeHalfSpinColourVector;
+typedef Lattice<vHalfSpinColourVector>   LatticeHalfSpinColourVector;
-typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
+typedef Lattice<vHalfSpinColourVectorF>  LatticeHalfSpinColourVectorF;
-typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
+typedef Lattice<vHalfSpinColourVectorD>  LatticeHalfSpinColourVectorD;
 typedef Lattice<vHalfSpinColourVectorD2> LatticeHalfSpinColourVectorD2;
 typedef Lattice<vTReal>            LatticeReal;
 typedef Lattice<vTRealF>           LatticeRealF;
@@ -295,6 +345,7 @@ typedef Lattice<vTRealD>           LatticeRealD;
 typedef Lattice<vTComplex>         LatticeComplex;
 typedef Lattice<vTComplexF>        LatticeComplexF;
 typedef Lattice<vTComplexD>        LatticeComplexD;
 typedef Lattice<vTComplexD2>       LatticeComplexD2;
 typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
@@ -302,37 +353,42 @@ typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
 ///////////////////////////////////////////
 // Physical names for things
 ///////////////////////////////////////////
-typedef LatticeHalfSpinColourVector  LatticeHalfFermion;
+typedef LatticeHalfSpinColourVector   LatticeHalfFermion;
-typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
+typedef LatticeHalfSpinColourVectorF  LatticeHalfFermionF;
-typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
+typedef LatticeHalfSpinColourVectorD  LatticeHalfFermionD;
 typedef LatticeHalfSpinColourVectorD2 LatticeHalfFermionD2;
 typedef LatticeSpinColourVector      LatticeFermion;
 typedef LatticeSpinColourVectorF     LatticeFermionF;
 typedef LatticeSpinColourVectorD     LatticeFermionD;
 typedef LatticeSpinColourVectorD2    LatticeFermionD2;
 typedef LatticeSpinColourMatrix                LatticePropagator;
 typedef LatticeSpinColourMatrixF               LatticePropagatorF;
 typedef LatticeSpinColourMatrixD               LatticePropagatorD;
 typedef LatticeSpinColourMatrixD2              LatticePropagatorD2;
 typedef LatticeLorentzColourMatrix             LatticeGaugeField;
 typedef LatticeLorentzColourMatrixF            LatticeGaugeFieldF;
 typedef LatticeLorentzColourMatrixD            LatticeGaugeFieldD;
 typedef LatticeLorentzColourMatrixD2           LatticeGaugeFieldD2;
 typedef LatticeDoubleStoredColourMatrix        LatticeDoubledGaugeField;
 typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
 typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;
 typedef LatticeDoubleStoredColourMatrixD2      LatticeDoubledGaugeFieldD2;
 template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
 // Uhgg... typing this hurt  ;)
 // (my keyboard got burning hot when I typed this, must be the anti-Fermion)
 typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
 typedef Lattice<vColourVectorF>         LatticeStaggeredFermionF;    
 typedef Lattice<vColourVectorD>         LatticeStaggeredFermionD;    
 typedef Lattice<vColourVectorD2>        LatticeStaggeredFermionD2;    
 typedef Lattice<vColourMatrix>          LatticeStaggeredPropagator; 
 typedef Lattice<vColourMatrixF>         LatticeStaggeredPropagatorF; 
 typedef Lattice<vColourMatrixD>         LatticeStaggeredPropagatorD; 
 typedef Lattice<vColourMatrixD2>        LatticeStaggeredPropagatorD2; 
 //////////////////////////////////////////////////////////////////////////////
 // Peek and Poke named after physics attributes
@@ -40,9 +40,47 @@ class Action
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
  RealD Fdt_norm_sum;
  RealD Fdt_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
  RealD refresh_us;
  void  reset_timer(void)        {
    deriv_us = S_us = refresh_us = 0.0;
    deriv_norm_sum = deriv_max_sum=0.0;
    Fdt_max_sum =  Fdt_norm_sum = 0.0;
    deriv_num=0;
  }
  void  deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
    if ( max > deriv_max_sum ) {
      deriv_max_sum=max;
    }
    deriv_norm_sum+=nrm;
    if ( Fdt_max > Fdt_max_sum ) {
      Fdt_max_sum=Fdt_max;
    }
    Fdt_norm_sum+=Fdt_nrm; deriv_num++;
  }
  RealD deriv_max_average(void)       { return deriv_max_sum; };
  RealD deriv_norm_average(void)      { return deriv_norm_sum/deriv_num; };
  RealD Fdt_max_average(void)         { return Fdt_max_sum; };
  RealD Fdt_norm_average(void)        { return Fdt_norm_sum/deriv_num; };
  RealD deriv_timer(void)        { return deriv_us; };
  RealD S_timer(void)            { return S_us; };
  RealD refresh_timer(void)      { return refresh_us; };
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
  virtual std::string action_name()    = 0;                             // return the action name
  virtual std::string LogParameters()  = 0;                             // prints action parameters
@@ -37,6 +37,10 @@ NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
 NAMESPACE_CHECK(ActionParams);
 #include <Grid/qcd/action/filters/MomentumFilter.h>
 #include <Grid/qcd/action/filters/DirichletFilter.h>
 #include <Grid/qcd/action/filters/DDHMCFilter.h>
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
@@ -34,27 +34,45 @@ directory
 NAMESPACE_BEGIN(Grid);
-// These can move into a params header and be given MacroMagic serialisation
+
 struct GparityWilsonImplParams {
  Coordinate twists;
-  GparityWilsonImplParams() : twists(Nd, 0) {};
+                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  int  partialDirichlet;
  GparityWilsonImplParams() : twists(Nd, 0) {
    dirichlet.resize(0);
    partialDirichlet=0;
  };
 };
 struct WilsonImplParams {
  bool overlapCommsCompute;
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  int  partialDirichlet;
  AcceleratorVector<Real,Nd> twist_n_2pi_L;
  AcceleratorVector<Complex,Nd> boundary_phases;
  WilsonImplParams()  {
    dirichlet.resize(0);
    partialDirichlet=0;
    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
  };
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
    partialDirichlet=0;
    dirichlet.resize(0);
  }
 };
 struct StaggeredImplParams {
-  StaggeredImplParams()  {};
+  Coordinate dirichlet; // Blocksize of dirichlet BCs
  int  partialDirichlet;
  StaggeredImplParams()
  {
    partialDirichlet=0;
    dirichlet.resize(0);
  };
 };
  struct OneFlavourRationalParams : Serializable {
@@ -63,9 +81,11 @@ struct StaggeredImplParams {
 				    RealD, hi, 
 				    int,   MaxIter, 
 				    RealD, tolerance, 
 				    RealD, mdtolerance, 
 				    int,   degree, 
 				    int,   precision,
-				    int,   BoundsCheckFreq);
+				    int,   BoundsCheckFreq,
 				    RealD, BoundsCheckTol);
  // MaxIter and tolerance, vectors??
@@ -76,16 +96,62 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20)
+				int _BoundsCheckFreq=20,
 				RealD mdtol    = 1.0e-6,
 				double _BoundsCheckTol=1e-6)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
        mdtolerance(mdtol),
 	degree(_degree),
        precision(_precision),
-        BoundsCheckFreq(_BoundsCheckFreq){};
+        BoundsCheckFreq(_BoundsCheckFreq),
        BoundsCheckTol(_BoundsCheckTol){};
  };
  /*Action parameters for the generalized rational action
    The approximation is for (M^dag M)^{1/inv_pow}
    where inv_pow is the denominator of the fractional power.
    Default inv_pow=2 for square root, making this equivalent to 
    the OneFlavourRational action
  */
    struct RationalActionParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
 				    int, inv_pow, 
 				    RealD, lo, //low eigenvalue bound of rational approx
 				    RealD, hi, //high eigenvalue bound of rational approx
 				    int,   MaxIter,  //maximum iterations in msCG
 				    RealD, action_tolerance,  //msCG tolerance in action evaluation
 				    int,   action_degree, //rational approx tolerance in action evaluation
 				    RealD, md_tolerance,  //msCG tolerance in MD integration
 				    int,   md_degree, //rational approx tolerance in MD integration
 				    int,   precision, //precision of floating point arithmetic
 				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
  // constructor 
  RationalActionParams(int _inv_pow = 2,
 		       RealD _lo      = 0.0, 
 		       RealD _hi      = 1.0, 
 		       int _maxit     = 1000,
 		       RealD _action_tolerance      = 1.0e-8, 
 		       int _action_degree    = 10,
 		       RealD _md_tolerance      = 1.0e-8, 
 		       int _md_degree    = 10,
 		       int _precision = 64,
 		       int _BoundsCheckFreq=20)
    : inv_pow(_inv_pow), 
      lo(_lo),
      hi(_hi),
      MaxIter(_maxit),
      action_tolerance(_action_tolerance),
      action_degree(_action_degree),
      md_tolerance(_md_tolerance),
      md_degree(_md_degree),
      precision(_precision),
      BoundsCheckFreq(_BoundsCheckFreq){};
  };
 NAMESPACE_END(Grid);
 #endif
@@ -71,6 +71,7 @@ public:
  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
  RealD MassPlus(void) { return mass_plus; };
  RealD MassMinus(void) { return mass_minus; };
  void  SetMass(RealD _mass) { 
    mass_plus=mass_minus=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
@@ -182,16 +183,6 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
  void CayleyReport(void);
  void CayleyZeroCounters(void);
  double M5Dflops;
  double M5Dcalls;
  double M5Dtime;
  double MooeeInvFlops;
  double MooeeInvCalls;
  double MooeeInvTime;
 protected:
  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
@@ -140,6 +140,7 @@ public:
    return NMAX;
  }
  static int getNMAX(Lattice<iImplClover<vComplexD2>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
@@ -0,0 +1,291 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/DWFSlow.h
 Copyright (C) 2022
 Author: Peter Boyle <pboyle@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template <class Impl>
 class DWFSlowFermion : public FermionOperator<Impl>
 {
 public:
  INHERIT_IMPL_TYPES(Impl);
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
  GridBase *GaugeGrid(void) { return _grid4; }
  GridBase *GaugeRedBlackGrid(void) { return _cbgrid4; }
  GridBase *FermionGrid(void) { return _grid; }
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
  //////////////////////////////////////////////////////////////////
  virtual void  M(const FermionField &in, FermionField &out)
  {
    FermionField tmp(_grid);
    out = (5.0 - M5) * in;
    Dhop(in,tmp,DaggerNo);
    out = out + tmp;
  }
  virtual void  Mdag(const FermionField &in, FermionField &out)
  {
    FermionField tmp(_grid);
    out = (5.0 - M5) * in;
    Dhop(in,tmp,DaggerYes);
    out = out + tmp;
  };
  /////////////////////////////////////////////////////////
  // half checkerboard operations 5D redblack so just site identiy
  /////////////////////////////////////////////////////////
  void Meooe(const FermionField &in, FermionField &out)
  {
    if ( in.Checkerboard() == Odd ) {
      this->DhopEO(in,out,DaggerNo);
    } else {
      this->DhopOE(in,out,DaggerNo);
    }
  }
  void MeooeDag(const FermionField &in, FermionField &out)
  {
    if ( in.Checkerboard() == Odd ) {
      this->DhopEO(in,out,DaggerYes);
    } else {
      this->DhopOE(in,out,DaggerYes);
    }
  };
  // allow override for twisted mass and clover
  virtual void Mooee(const FermionField &in, FermionField &out)
  {
    out = (5.0 - M5) * in;
  }
  virtual void MooeeDag(const FermionField &in, FermionField &out)
  {
    out = (5.0 - M5) * in;
  }
  virtual void MooeeInv(const FermionField &in, FermionField &out)
  {
    out = (1.0/(5.0 - M5)) * in;
  };
  virtual void MooeeInvDag(const FermionField &in, FermionField &out)
  {
    out = (1.0/(5.0 - M5)) * in;
  };
  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector<double> twist) {} ;
  ////////////////////////
  // Derivative interface
  ////////////////////////
  // Interface calls an internal routine
  void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)  { assert(0);};
  void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
  void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
  ///////////////////////////////////////////////////////////////
  // non-hermitian hopping term; half cb or both
  ///////////////////////////////////////////////////////////////
  void Dhop(const FermionField &in, FermionField &out, int dag)
  {
    FermionField tmp(in.Grid());
    Dhop5(in,out,MassField,MassField,dag );
    for(int mu=0;mu<4;mu++){
      DhopDirU(in,Umu[mu],Umu[mu],tmp,mu,dag );    out = out + tmp;
    }
  };
  void DhopOE(const FermionField &in, FermionField &out, int dag)
  {
    FermionField tmp(in.Grid());
    assert(in.Checkerboard()==Even);
    Dhop5(in,out,MassFieldOdd,MassFieldEven,dag);
    for(int mu=0;mu<4;mu++){
      DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag );    out = out + tmp;
    }
  };
  void DhopEO(const FermionField &in, FermionField &out, int dag)
  {
    FermionField tmp(in.Grid());
    assert(in.Checkerboard()==Odd);
    Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag );  
    for(int mu=0;mu<4;mu++){
      DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag );    out = out + tmp;
    }
  };
  ///////////////////////////////////////////////////////////////
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);};
  void MdirAll(const FermionField &in, std::vector<FermionField> &out)   { assert(0);};
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);};
  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out)    { assert(0);};
  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);};
  void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag)
  {
    RealD     sgn= 1.0;
    if (dag ) sgn=-1.0;
    Gamma::Algebra Gmu [] = {
 			 Gamma::Algebra::GammaX,
 			 Gamma::Algebra::GammaY,
 			 Gamma::Algebra::GammaZ,
 			 Gamma::Algebra::GammaT
    };
    //    mass is  1,1,1,1,-m has to multiply the round the world term
    FermionField tmp (in.Grid());
    tmp = U5e * Cshift(in,mu+1,1);
    out = tmp - Gamma(Gmu[mu])*tmp*sgn;
    tmp = Cshift(adj(U5o)*in,mu+1,-1);
    out = out + tmp + Gamma(Gmu[mu])*tmp*sgn;
    out = -0.5*out;
  };
  void Dhop5(const FermionField &in, FermionField &out, ComplexField &massE, ComplexField &massO, int dag)
  {
    // Mass term.... must multiple the round world with mass = 1,1,1,1, -m
    RealD     sgn= 1.0;
    if (dag ) sgn=-1.0;
    Gamma G5(Gamma::Algebra::Gamma5);
    FermionField tmp (in.Grid());
    tmp = massE*Cshift(in,0,1);
    out = tmp - G5*tmp*sgn;
    tmp = Cshift(massO*in,0,-1);
    out = out + tmp + G5*tmp*sgn;
    out = -0.5*out;
  };
  // Constructor
  DWFSlowFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 		 GridRedBlackCartesian &Hgrid, RealD _mass, RealD _M5)
    :
    _grid(&Fgrid),
    _cbgrid(&Hgrid),
    _grid4(_Umu.Grid()),
    Umu(Nd,&Fgrid),
    UmuEven(Nd,&Hgrid),
    UmuOdd(Nd,&Hgrid),
    MassField(&Fgrid),
    MassFieldEven(&Hgrid),
    MassFieldOdd(&Hgrid),
    M5(_M5),
    mass(_mass),
    _tmp(&Hgrid)
    {
      Ls=Fgrid._fdimensions[0];
      ImportGauge(_Umu);
      typedef typename FermionField::scalar_type scalar;
      Lattice<iScalar<vInteger> > coor(&Fgrid);
      LatticeCoordinate(coor, 0); // Scoor
      ComplexField one(&Fgrid);
      MassField =scalar(-mass);
      one       =scalar(1.0);
      MassField =where(coor==Integer(Ls-1),MassField,one);
      for(int mu=0;mu<Nd;mu++){
 	pickCheckerboard(Even,UmuEven[mu],Umu[mu]);
 	pickCheckerboard(Odd ,UmuOdd[mu],Umu[mu]);
      }
      pickCheckerboard(Even,MassFieldEven,MassField);
      pickCheckerboard(Odd ,MassFieldOdd,MassField);
    }
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu4)
  {
    GaugeLinkField U4(_grid4);
    for(int mu=0;mu<Nd;mu++){
      U4 = PeekIndex<LorentzIndex>(_Umu4, mu);
      for(int s=0;s<this->Ls;s++){
 	InsertSlice(U4,Umu[mu],s,0);
      }
    }
  }
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
 public:
  virtual RealD Mass(void) { return mass; }
  virtual int   isTrivialEE(void) { return 1; };
  RealD mass;
  RealD M5;
  int Ls;
  GridBase *_grid4;
  GridBase *_grid;
  GridBase *_cbgrid4;
  GridBase *_cbgrid;
  // Copy of the gauge field , with even and odd subsets
  std::vector<GaugeLinkField> Umu;
  std::vector<GaugeLinkField> UmuEven;
  std::vector<GaugeLinkField> UmuOdd;
  ComplexField MassField;
  ComplexField MassFieldEven;
  ComplexField MassFieldOdd;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
  void ContractConservedCurrent(PropagatorField &q_in_1,
                                PropagatorField &q_in_2,
                                PropagatorField &q_out,
                                PropagatorField &phys_src,
                                Current curr_type,
                                unsigned int mu){}
  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
                           PropagatorField &phys_src,
                           Current curr_type,
                           unsigned int mu,
                           unsigned int tmin,
 			   unsigned int tmax,
 			   ComplexField &lattice_cmplx){}
 };
 typedef DWFSlowFermion<WilsonImplF> DWFSlowFermionF;
 typedef DWFSlowFermion<WilsonImplD> DWFSlowFermionD;
 NAMESPACE_END(Grid);
@@ -47,6 +47,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/DWFSlow.h>       // Slow DWF
 #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(Wilson);
@@ -112,28 +113,21 @@ NAMESPACE_CHECK(DWFutils);
 // Cayley 5d
 NAMESPACE_BEGIN(Grid);
-typedef WilsonFermion<WilsonImplR> WilsonFermionR;
+typedef WilsonFermion<WilsonImplD2> WilsonFermionD2;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 //typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
 //typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
 //typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
 typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
 typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
 typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
 // Twisted mass fermion
-typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
+typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
@@ -141,23 +135,20 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
 template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
-typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonClover<WilsonImplD2> WilsonCloverFermionD2;
 typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
 typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
+typedef WilsonExpClover<WilsonImplD2> WilsonExpCloverFermionD2;
 typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
 typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
 typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
 typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
 typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
@@ -165,161 +156,108 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
 template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
 template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
-typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
+typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
 typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
-typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
+typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
 typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
 typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
 typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
 typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
 typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef DomainWallFermion<WilsonImplD2> DomainWallFermionD2;
-//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
+typedef DomainWallEOFAFermion<WilsonImplD2> DomainWallEOFAFermionD2;
 //typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
 //typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
 typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
-//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
+typedef MobiusFermion<WilsonImplD2> MobiusFermionD2;
 //typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
 //typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
-//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
+typedef MobiusEOFAFermion<WilsonImplD2> MobiusEOFAFermionD2;
 //typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
 //typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
 typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
 typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
 typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
-//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
+typedef ZMobiusFermion<ZWilsonImplD2> ZMobiusFermionD2;
 //typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
 //typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
 typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
-//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
+typedef ScaledShamirFermion<WilsonImplD2> ScaledShamirFermionD2;
 //typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
 //typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
 // Ls vectorised
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
-typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
+typedef MobiusZolotarevFermion<WilsonImplD2> MobiusZolotarevFermionD2;
 typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
 typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
-typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
+typedef ShamirZolotarevFermion<WilsonImplD2> ShamirZolotarevFermionD2;
 typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
 typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
-typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplD2> OverlapWilsonCayleyTanhFermionD2;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
-typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD2> OverlapWilsonCayleyZolotarevFermionD2;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
 // Continued fraction
-typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
+typedef OverlapWilsonContFracTanhFermion<WilsonImplD2> OverlapWilsonContFracTanhFermionD2;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
-typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD2> OverlapWilsonContFracZolotarevFermionD2;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
 // Partial fraction
-typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD2> OverlapWilsonPartialFractionTanhFermionD2;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
-typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD2> OverlapWilsonPartialFractionZolotarevFermionD2;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
 // Gparity cases; partial list until tested
 typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
 typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
 //typedef WilsonFermion<GparityWilsonImplRL>     GparityWilsonFermionRL;
 //typedef WilsonFermion<GparityWilsonImplFH>     GparityWilsonFermionFH;
 //typedef WilsonFermion<GparityWilsonImplDF>     GparityWilsonFermionDF;
 typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
-//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
+typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionD2;
 //typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
 //typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
 typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
-//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
+typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionD2;
 //typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
 //typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
 typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
-//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
+typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionD2;
 //typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
 //typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
 typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
-//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
+typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionD2;
 //typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
 //typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
 typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
 //typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
 //typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
 //typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
 typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
 typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
 typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
 typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
@@ -49,6 +49,8 @@ public:
  virtual FermionField &tmp(void) = 0;
  virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
@@ -30,6 +30,18 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+    //If this site is an global boundary site, perform the G-parity flavor twist
-
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@@ -197,6 +209,19 @@ public:
    reg = memory;
  }
  //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    autoView(poke_f0_v, poke_f0, CpuRead);
    autoView(poke_f1_v, poke_f1, CpuRead);
    autoView(Uds_v, Uds, CpuWrite);
    thread_foreach(ss,poke_f0_v,{
 	Uds_v[ss](0)(mu) = poke_f0_v[ss]();
 	Uds_v[ss](1)(mu) = poke_f1_v[ss]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -207,14 +232,19 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-        
+
-    for(int mu=0;mu<Nd;mu++){
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
-          
+    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
-      LatticeCoordinate(coor,mu);
+    for(int mu=0;mu<Nd-1;mu++){
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -229,7 +259,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	  });
+	});
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@@ -260,6 +290,38 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -298,28 +360,48 @@ public:
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
-  
+ 
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
+    int Ls=Btilde.Grid()->_fdimensions[0];
-    int Ls = Btilde.Grid()->_fdimensions[0];
+    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
+      GridBase *GaugeGrid = mat.Grid();
-      autoView( Atilde_v , Atilde, CpuRead);
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
-      autoView( Btilde_v , Btilde, CpuRead);
+
-      thread_for(ss,tmp.Grid()->oSites(),{
+      if( Params.twists[mu] ){
-	  for (int s = 0; s < Ls; s++) {
+	LatticeCoordinate(coor,mu);
-	    int sF = s + Ls * ss;
+      }
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      autoView( mat_v , mat, AcceleratorWrite);
-	  }
+      autoView( Btilde_v , Btilde, AcceleratorRead);
-	});
+      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
@@ -47,18 +47,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
  void Report(void);
  void ZeroCounters(void);
  double DhopTotalTime;
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -52,18 +52,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
  void Report(void);
  void ZeroCounters(void);
  double DhopTotalTime;
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -47,18 +47,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
  void Report(void);
  void ZeroCounters(void);
  double DhopTotalTime;
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -32,17 +32,218 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////
 // Wilson compressor will need FaceGather policies for:
 // Periodic, Dirichlet, and partial Dirichlet for DWF
 ///////////////////////////////////////////////////////////////
 const int dwf_compressor_depth=2;
 #define DWF_COMPRESS
 class FaceGatherPartialDWF
 {
 public:
 #ifdef DWF_COMPRESS
  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
 #else
  static int PartialCompressionFactor(GridBase *grid) { return 1;}
 #endif
  template<class vobj,class cobj,class compressor>
  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
 				   const Lattice<vobj> &rhs,
 				   cobj *buffer,
 				   compressor &compress,
 				   int off,int so,int partial)
  {
    //DWF only hack: If a direction that is OFF node we use Partial Dirichlet
    //  Shrinks local and remote comms buffers
    GridBase *Grid = rhs.Grid();
    int Ls = Grid->_rdimensions[0];
 #ifdef DWF_COMPRESS
    int depth=dwf_compressor_depth;
 #else 
    int depth=Ls/2;
 #endif
    std::pair<int,int> *table_v = & table[0];
    auto rhs_v = rhs.View(AcceleratorRead);
    int vol=table.size()/Ls;
    accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
 	Integer i=idx/Ls;
 	Integer s=idx%Ls;
 	Integer sc=depth+s-(Ls-depth);
 	if(s<depth)     compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
 	if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
    });
    rhs_v.ViewClose();
  }
  template<class decompressor,class Decompression>
  static void DecompressFace(decompressor decompress,Decompression &dd)
  {
    auto Ls = dd.dims[0];
 #ifdef DWF_COMPRESS
    int depth=dwf_compressor_depth;
 #else
    int depth=Ls/2;
 #endif    
    // Just pass in the Grid
    auto kp = dd.kernel_p;
    auto mp = dd.mpi_p;
    int size= dd.buffer_size;
    int vol= size/Ls;
    accelerator_forNB(o,size,1,{
 	int idx=o/Ls;
 	int   s=o%Ls;
 	if ( s < depth ) {
 	  int oo=s*vol+idx;
 	  kp[o]=mp[oo];
 	} else if ( s >= Ls-depth ) {
 	  int sc = depth + s - (Ls-depth);
 	  int oo=sc*vol+idx; 
 	  kp[o]=mp[oo];
 	} else {
 	  kp[o] = Zero();//fill rest with zero if partial dirichlet
 	}
    });
  }
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Need to gather *interior portions* for ALL s-slices in simd directions
  // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj,class cobj,class compressor>
  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
    GridBase *Grid = rhs.Grid();
    int Ls = Grid->_rdimensions[0];
 #ifdef DWF_COMPRESS
    int depth=dwf_compressor_depth;
 #else
    int depth = Ls/2;
 #endif
    // insertion of zeroes...
    assert( (table.size()&0x1)==0);
    int num=table.size()/2;
    int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
    auto rhs_v = rhs.View(AcceleratorRead);
    auto p0=&pointers[0][0];
    auto p1=&pointers[1][0];
    auto tp=&table[0];
    int nnum=num/Ls;
    accelerator_forNB(j, num, vobj::Nsimd(), {
 	//  Reorders both local and remote comms buffers
 	//  
 	int s  = j % Ls;
 	int sp1 = (s+depth)%Ls;  // peri incremented s slice
 	int hxyz= j/Ls;
 	int xyz0= hxyz*2; // xyzt part of coor
 	int xyz1= hxyz*2+1;
 	int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
 	int kk0= xyz0*Ls + s ; // s=0 goes to s=1
 	int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
 	compress.CompressExchange(p0[jj],p1[jj],
 				  rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
 				  rhs_v[so+tp[kk1 ].second], 
 				  type);
    });
    rhs_v.ViewClose();
  }
  // Merge routine is for SIMD faces
  template<class decompressor,class Merger>
  static void MergeFace(decompressor decompress,Merger &mm)
  {
    auto Ls = mm.dims[0];
 #ifdef DWF_COMPRESS
    int depth=dwf_compressor_depth;
 #else
    int depth = Ls/2;
 #endif
    int  num= mm.buffer_size/2; // relate vol and Ls to buffer size
    auto mp = &mm.mpointer[0];
    auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
    auto vp1= &mm.vpointers[1][0];
    auto type= mm.type;
    int nnum = num/Ls;
    accelerator_forNB(o,num,Merger::Nsimd,{
 	int  s=o%Ls;
 	int hxyz=o/Ls; // xyzt related component
 	int xyz0=hxyz*2;
 	int xyz1=hxyz*2+1;
 	int sp = (s+depth)%Ls; 
 	int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
 	int oo0= s+xyz0*Ls;
 	int oo1= s+xyz1*Ls;
 	// same ss0, ss1 pair goes to new layout
 	decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
      });
  }
 };
 class FaceGatherDWFMixedBCs
 {
 public:
 #ifdef DWF_COMPRESS
  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
 #else 
  static int PartialCompressionFactor(GridBase *grid) {return 1;}
 #endif
  template<class vobj,class cobj,class compressor>
  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
 					 const Lattice<vobj> &rhs,
 					 cobj *buffer,
 					 compressor &compress,
 					 int off,int so,int partial)
  {
    //    std::cout << " face gather simple DWF partial "<<partial <<std::endl;
    if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
    else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
  }
  template<class vobj,class cobj,class compressor>
  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
    //    std::cout << " face gather exch DWF partial "<<partial <<std::endl;
    if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
    else        FaceGatherSimple::Gather_plane_exchange    (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
  }
  template<class decompressor,class Merger>
  static void MergeFace(decompressor decompress,Merger &mm)
  {
    int partial = mm.partial;
    //    std::cout << " merge DWF partial "<<partial <<std::endl;
    if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
    else           FaceGatherSimple::MergeFace(decompress,mm);
  }
  template<class decompressor,class Decompression>
  static void DecompressFace(decompressor decompress,Decompression &dd)
  {
    int partial = dd.partial;
    //    std::cout << " decompress DWF partial "<<partial <<std::endl;
    if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
    else           FaceGatherSimple::DecompressFace(decompress,dd);
  }
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
-// optimised versions supporting half precision too
+// optimised versions supporting half precision too??? Deprecate
 /////////////////////////////////////////////////////////////////////////////////////////////
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
 class WilsonCompressorTemplate;
 //Could make FaceGather a template param, but then behaviour is runtime not compile time
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
-class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
+class WilsonCompressorTemplate  : public FaceGatherDWFMixedBCs
-				typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
+//  : public FaceGatherSimple
 {
 public:
@@ -79,172 +280,81 @@ public:
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
-  accelerator_inline void Exchange(SiteHalfSpinor *mp,
+  accelerator_inline void Exchange(SiteHalfSpinor &mp0,
-				   const SiteHalfSpinor * __restrict__ vp0,
+				   SiteHalfSpinor &mp1,
-				   const SiteHalfSpinor * __restrict__ vp1,
+				   const SiteHalfSpinor & vp0,
-				   Integer type,Integer o) const {
+				   const SiteHalfSpinor & vp1,
 				   Integer type) const {
 #ifdef GRID_SIMT
-    exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+    exchangeSIMT(mp0,mp1,vp0,vp1,type);
 #else
    SiteHalfSpinor tmp1;
    SiteHalfSpinor tmp2;
-    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
+    exchange(tmp1,tmp2,vp0,vp1,type);
-    vstream(mp[2*o  ],tmp1);
+    vstream(mp0,tmp1);
-    vstream(mp[2*o+1],tmp2);
+    vstream(mp1,tmp2);
 #endif
  }
-
+  
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
+  accelerator_inline void Decompress(SiteHalfSpinor &out,
-				     SiteHalfSpinor * __restrict__ in, Integer o) const {    
+				     SiteHalfSpinor &in) const {    
-    assert(0);
+    out = in;
  }
  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
-  accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
+  accelerator_inline void CompressExchange(SiteHalfSpinor &out0,
-					   SiteHalfSpinor * __restrict__ out1,
+					   SiteHalfSpinor &out1,
-					   const SiteSpinor * __restrict__ in,
+					   const SiteSpinor &in0,
-					   Integer j,Integer k, Integer m,Integer type) const
+					   const SiteSpinor &in1,
 					   Integer type) const
  {
 #ifdef GRID_SIMT
    typedef SiteSpinor vobj;
    typedef SiteHalfSpinor hvobj;
-    typedef decltype(coalescedRead(*in))    sobj;
+    typedef decltype(coalescedRead(in0))    sobj;
-    typedef decltype(coalescedRead(*out0)) hsobj;
+    typedef decltype(coalescedRead(out0)) hsobj;
    constexpr unsigned int Nsimd = vobj::Nsimd();
    unsigned int mask = Nsimd >> (type + 1);
    int lane = acceleratorSIMTlane(Nsimd);
    int j0 = lane &(~mask); // inner coor zero
    int j1 = lane |(mask) ; // inner coor one
-    const vobj *vp0 = &in[k];  // out0[j] = merge low bit of type from in[k] and in[m] 
+    const vobj *vp0 = &in0;
-    const vobj *vp1 = &in[m];  // out1[j] = merge hi  bit of type from in[k] and in[m]
+    const vobj *vp1 = &in1;
-    const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
+    const vobj *vp = (lane&mask) ? vp1:vp0;
-    auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
+    auto sa = coalescedRead(*vp,j0);
-    auto sb = coalescedRead(*vp,j1); // lane to read for out 1
+    auto sb = coalescedRead(*vp,j1);
    hsobj psa, psb;
-    projector::Proj(psa,sa,mu,dag);  // spin project the result0
+    projector::Proj(psa,sa,mu,dag);
-    projector::Proj(psb,sb,mu,dag);  // spin project the result1
+    projector::Proj(psb,sb,mu,dag);
-    coalescedWrite(out0[j],psa);
+    coalescedWrite(out0,psa);
-    coalescedWrite(out1[j],psb);
+    coalescedWrite(out1,psb);
 #else
    SiteHalfSpinor temp1, temp2;
    SiteHalfSpinor temp3, temp4;
-    projector::Proj(temp1,in[k],mu,dag);
+    projector::Proj(temp1,in0,mu,dag);
-    projector::Proj(temp2,in[m],mu,dag);
+    projector::Proj(temp2,in1,mu,dag);
    exchange(temp3,temp4,temp1,temp2,type);
-    vstream(out0[j],temp3);
+    vstream(out0,temp3);
-    vstream(out1[j],temp4);
+    vstream(out1,temp4);
 #endif
  }
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  accelerator_inline bool DecompressionStep(void) const { return false; }
+  accelerator_inline bool DecompressionStep(void) const {
    return false;
  }
 };
 #if 0
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
 				typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
 {
 public:
  int mu,dag;  
  void Point(int p) { mu=p; };
  WilsonCompressorTemplate(int _dag=0){
    dag = _dag;
  }
  typedef _Spinor         SiteSpinor;
  typedef _Hspinor     SiteHalfSpinor;
  typedef _HCspinor SiteHalfCommSpinor;
  typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
  accelerator_inline int CommDatumSize(void) const {
    return sizeof(SiteHalfCommSpinor);
  }
  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
    SiteHalfSpinor hsp;
    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
    projector::Proj(hsp,in,mu,dag);
    precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
  }
  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
 #ifdef GRID_SIMT
    typedef decltype(coalescedRead(buf)) sobj;
    sobj sp;
    auto sin = coalescedRead(in);
    projector::Proj(sp,sin,mu,dag);
    coalescedWrite(buf,sp);
 #else
    projector::Proj(buf,in,mu,dag);
 #endif
  }
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
  accelerator_inline void Exchange(SiteHalfSpinor *mp,
                       SiteHalfSpinor *vp0,
                       SiteHalfSpinor *vp1,
 		       Integer type,Integer o) const {
    SiteHalfSpinor vt0,vt1;
    SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
    SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
    precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw);
    precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw);
    exchange(mp[2*o],mp[2*o+1],vt0,vt1,type);
  }
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
  accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
    SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
    precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
  }
  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
  accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
 			       SiteHalfSpinor *out1,
 			       const SiteSpinor *in,
 			       Integer j,Integer k, Integer m,Integer type) const {
    SiteHalfSpinor temp1, temp2,temp3,temp4;
    SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
    SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
    projector::Proj(temp1,in[k],mu,dag);
    projector::Proj(temp2,in[m],mu,dag);
    exchange(temp3,temp4,temp1,temp2,type);
    precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw);
    precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw);
  }
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
  accelerator_inline bool DecompressionStep(void) const { return true; }
 };
 #endif
 #define DECLARE_PROJ(Projector,Compressor,spProj)			\
  class Projector {							\
  public:								\
@@ -294,11 +404,7 @@ public:
  typedef typename Base::View_type View_type;
  typedef typename Base::StencilVector StencilVector;
-  void ZeroCountersi(void)  {  }
+  //  Vector<int> surface_list;
  void Reporti(int calls)  {  }
  std::vector<int> surface_list;
  WilsonStencil(GridBase *grid,
 		int npoints,
 		int checkerboard,
@@ -306,11 +412,11 @@ public:
 		const std::vector<int> &distances,Parameters p)  
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
  { 
-    ZeroCountersi();
+    //    surface_list.resize(0);
    surface_list.resize(0);
    this->same_node.resize(npoints);
  };
  /*
  void BuildSurfaceList(int Ls,int vol4){
    // find same node for SHM
@@ -331,7 +437,8 @@ public:
      }
    }
  }
-
+  */
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
  {
@@ -377,28 +484,29 @@ public:
    int dag = compress.dag;
    int face_idx=0;
 #define vet_same_node(a,b) \
      { auto tmp = b;  }
    if ( dag ) { 
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
+      vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
+      vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
+      vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
+      vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
+      vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
+      vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
+      vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
+      vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
    } else {
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
+      vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
+      vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
+      vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
+      vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
+      vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
+      vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
+      vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
+      vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
    }
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    accelerator_barrier();
  }
 };
@@ -74,20 +74,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  void Report(void);
  void ZeroCounters(void);
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  double DhopTotalTime;
  double DerivCalls;
  double DerivCommTime;
  double DerivComputeTime;
  double DerivDhopComputeTime;
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
@@ -75,19 +75,8 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
-  void Report(void);
+  int Dirichlet;
-  void ZeroCounters(void);
+  Coordinate Block; 
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  double DhopTotalTime;
  double DerivCalls;
  double DerivCommTime;
  double DerivComputeTime;
  double DerivDhopComputeTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
@@ -173,7 +162,10 @@ public:
 		  GridCartesian         &FourDimGrid,
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
-    
+
  virtual void DirichletBlock(const Coordinate & block)
  {
  }
  // Constructors
  /*
    WilsonFermion5D(int simd, 
@@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid);
 template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
 class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
 public:
-
+  
  static const int Dimension = Representation::Dimension;
  static const bool isFundamental = Representation::isFundamental;
  static const bool LsVectorised=false;
@@ -242,19 +242,13 @@ public:
 typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal > WilsonImplR;  // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF;  // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD;  // Double
-
+typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > WilsonImplD2;  // Double
 //typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL;  // Real.. whichever prec
 //typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH;  // Float
 //typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF;  // Double
 typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
 typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffComplex > ZWilsonImplD2; // Double
 //typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
 //typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
 //typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
 typedef WilsonImpl<vComplex,  AdjointRepresentation, CoeffReal > WilsonAdjImplR;   // Real.. whichever prec
 typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF;  // Float
 typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD;  // Double
@@ -52,13 +52,6 @@ public:
  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
 public:
 #ifdef GRID_SYCL
 #define SYCL_HACK
 #endif  
 #ifdef SYCL_HACK
  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
 			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
 #endif
  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -152,58 +152,6 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
  }
 }
 template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 {
  this->Report();
  Coordinate latt = GridDefaultLatt();          
  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP     = this->_FourDimGrid->_Nprocessors;
  if ( M5Dcalls > 0 ) {
    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
    // Flops = 10.0*(Nc*Ns) *Ls*vol
    RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
    // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
    // write = 1
    RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
    std::cout << GridLogMessage << "Average bandwidth (GB/s)                 : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
  }
  if ( MooeeInvCalls > 0 ) {
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
 #ifdef GRID_CUDA
    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 #else
    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 #endif
  }
 }
 template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
 {
  this->ZeroCounters();
  M5Dflops=0;
  M5Dcalls=0;
  M5Dtime=0;
  MooeeInvFlops=0;
  MooeeInvCalls=0;
  MooeeInvTime=0;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
@@ -646,7 +594,6 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
@@ -765,7 +712,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
    else          q_out +=     C;
  }
-#endif
+
 }
 template <class Impl>
@@ -832,7 +779,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
 #endif
 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
@@ -952,7 +898,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    InsertSlice(L_Q, q_out, s , 0);
  }
 #endif
 }
 #undef Pp
 #undef Pm
@@ -960,88 +905,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #undef TopRowWithSource
 #if 0
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
 						 Vector<iSinglet<Simd> > & Matp,
 						 Vector<iSinglet<Simd> > & Matm)
 {
  int Ls=this->Ls;
  GridBase *grid = this->FermionRedBlackGrid();
  int LLs = grid->_rdimensions[0];
  if ( LLs == Ls ) {
    return; // Not vectorised in 5th direction
  }
  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
    Pminus(s,s)= bee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pminus(s,s+1) = -cee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pplus(s+1,s) = -cee[s+1];
  }
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  Eigen::MatrixXcd PplusMat ;
  Eigen::MatrixXcd PminusMat;
  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  } else { 
    PplusMat =Pplus;
    PminusMat=Pminus;
  }
  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
  const int Nsimd=Simd::Nsimd();
  Matp.resize(Ls*LLs);
  Matm.resize(Ls*LLs);
  for(int s2=0;s2<Ls;s2++){
    for(int s1=0;s1<LLs;s1++){
      int istride = LLs;
      int ostride = 1;
      Simd Vp;
      Simd Vm;
      scalar_type *sp = (scalar_type *)&Vp;
      scalar_type *sm = (scalar_type *)&Vm;
      for(int l=0;l<Nsimd;l++){
 	if ( switcheroo<Coeff_t>::iscomplex() ) {
 	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
 	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
 	} else { 
 	  // if real
 	  scalar_type tmp;
 	  tmp = PplusMat (l*istride+s1*ostride,s2);
 	  sp[l] = scalar_type(tmp.real(),tmp.real());
 	  tmp = PminusMat(l*istride+s1*ostride,s2);
 	  sm[l] = scalar_type(tmp.real(),tmp.real());
 	}
      }
      Matp[LLs*s2+s1] = Vp;
      Matm[LLs*s2+s1] = Vm;
    }}
 }
 #endif
 NAMESPACE_END(Grid);
@@ -63,23 +63,18 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  // 10 = 3 complex mult + 2 complex add
  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
+  uint64_t nloop = grid->oSites();
  M5Dtime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    uint64_t ss= sss*Ls;
+    uint64_t s = sss%Ls;
    uint64_t ss= sss-s;
    typedef decltype(coalescedRead(psi[0])) spinor;
    spinor tmp1, tmp2;
-    for(int s=0;s<Ls;s++){
+    uint64_t idx_u = ss+((s+1)%Ls);
-      uint64_t idx_u = ss+((s+1)%Ls);
+    uint64_t idx_l = ss+((s+Ls-1)%Ls);
-      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+    spProj5m(tmp1,psi(idx_u));
-      spProj5m(tmp1,psi(idx_u));
+    spProj5p(tmp2,psi(idx_l));
-      spProj5p(tmp2,psi(idx_l));
+    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
 }
 template<class Impl>  
@@ -105,23 +100,18 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  int Ls=this->Ls;
  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
+  uint64_t nloop = grid->oSites();
  M5Dtime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    uint64_t ss=sss*Ls;
+    uint64_t s = sss%Ls;
    uint64_t ss= sss-s;
    typedef decltype(coalescedRead(psi[0])) spinor;
    spinor tmp1,tmp2;
-    for(int s=0;s<Ls;s++){
+    uint64_t idx_u = ss+((s+1)%Ls);
-      uint64_t idx_u = ss+((s+1)%Ls);
+    uint64_t idx_l = ss+((s+Ls-1)%Ls);
-      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+    spProj5p(tmp1,psi(idx_u));
-      spProj5p(tmp1,psi(idx_u));
+    spProj5m(tmp2,psi(idx_l));
-      spProj5m(tmp2,psi(idx_l));
+    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
 }
 template<class Impl>
@@ -142,8 +132,6 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
  auto pleem = & leem[0];
  auto pueem = & ueem[0];
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -180,8 +168,6 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
      coalescedWrite(chi[ss+s],res);
    }
  });
  MooeeInvTime+=usecond();
 }
@@ -204,10 +190,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
  assert(psi.Checkerboard() == psi.Checkerboard());
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -244,7 +226,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
      coalescedWrite(chi[ss+s],res);
    }
  });
  MooeeInvTime+=usecond();
 }
@@ -94,10 +94,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
      d_p[ss] = diag[s];
    }}
  M5Dcalls++;
  M5Dtime-=usecond();
  assert(Nc==3);
  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
@@ -198,7 +194,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
    }
 #endif
  });
  M5Dtime+=usecond();
 }
 template<class Impl>  
@@ -242,8 +237,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
      d_p[ss] = diag[s];
    }}
  M5Dcalls++;
  M5Dtime-=usecond();
  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
 #if 0
    alignas(64) SiteHalfSpinor hp;
@@ -339,7 +332,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
    }
 #endif
  });
  M5Dtime+=usecond();
 }
@@ -813,9 +805,6 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
  }
  assert(_Matp->size()==Ls*LLs);
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  if ( switcheroo<Coeff_t>::iscomplex() ) {
    thread_loop( (auto site=0;site<vol;site++),{
      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
@@ -825,7 +814,7 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    });
  }
-  MooeeInvTime+=usecond();
+
 }
 NAMESPACE_END(Grid);
@@ -54,8 +54,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  auto pupper = &upper[0];
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  auto nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -71,7 +69,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -91,8 +88,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  auto nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -108,7 +103,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -127,8 +121,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  auto pleem = & this->leem[0];
  auto pueem = & this->ueem[0];
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -164,7 +156,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
      coalescedWrite(chi[ss+s],res);
    }
  });
  this->MooeeInvTime += usecond();
 }
 template<class Impl>
@@ -185,8 +176,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
  assert(psi.Checkerboard() == psi.Checkerboard());
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  auto nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -223,7 +212,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
    }
  });
  this->MooeeInvTime += usecond();
 }
 NAMESPACE_END(Grid);
@@ -298,45 +298,33 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();
  DhopFaceTime-=usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime+=usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Remove explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime-=usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime+=usecond();
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  DhopComputeTime2-=usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime2+=usecond();
 }
 template<class Impl>
@@ -347,22 +335,14 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  Compressor compressor;
  int LLs = in.Grid()->_rdimensions[0];
 //double t1=usecond();
  DhopTotalTime -= usecond();
  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
  DhopCommTime += usecond();
  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 }
 /*CHANGE END*/
@@ -371,7 +351,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check
@@ -383,7 +362,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check
@@ -395,7 +373,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
  conformable(in.Grid(),FermionGrid()); // verifies full grid
  conformable(in.Grid(),out.Grid());
@@ -404,58 +381,6 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Report(void) 
 {
  Coordinate latt = GridDefaultLatt();          
  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _FourDimGrid->_Nprocessors;
  RealD NN = _FourDimGrid->NodeCount();
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
 	    << DhopCalls   << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
 	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
 	    << DhopCommTime    / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
 	    << DhopComputeTime / DhopCalls << " us" << std::endl;
  // Average the compute time
  _FourDimGrid->GlobalSum(DhopComputeTime);
  DhopComputeTime/=NP;
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
 {
  DhopCalls       = 0;
  DhopTotalTime    = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopFaceTime    = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
@@ -334,7 +334,6 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());
@@ -346,7 +345,6 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -359,7 +357,6 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -418,47 +415,33 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  Compressor compressor; 
  int len =  U.Grid()->oSites();
  DhopTotalTime   -= usecond();
  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime    += usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime    += usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  // First to enter, last to leave timing
  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
  DhopFaceTime    -= usecond();
  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime2    += usecond();
 }
@@ -471,78 +454,16 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  DhopTotalTime   -= usecond();
  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();
  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 };
  ////////////////////////////////////////////////////////////////
  // Reporting
  ////////////////////////////////////////////////////////////////
 template<class Impl>
 void ImprovedStaggeredFermion<Impl>::Report(void) 
 {
  Coordinate latt = _grid->GlobalDimensions();
  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
 	    << DhopCalls   << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
 	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
 	    << DhopCommTime    / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
 	    << DhopComputeTime / DhopCalls << " us" << std::endl;
  // Average the compute time
  _grid->GlobalSum(DhopComputeTime);
  DhopComputeTime/=NP;
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 template<class Impl>
 void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
 {
  DhopCalls       = 0;
  DhopTotalTime   = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopFaceTime    = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
@@ -55,9 +55,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss = sss*Ls;
@@ -73,7 +70,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -99,9 +95,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
  auto pshift_coeffs = &shift_coeffs[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss = sss*Ls;
@@ -122,7 +115,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -143,9 +135,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(), {
    uint64_t ss = sss*Ls;
@@ -161,8 +150,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -186,9 +173,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
  auto pshift_coeffs = &shift_coeffs[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  auto pm = this->pm;
  int nloop = grid->oSites()/Ls;
@@ -217,7 +201,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -237,9 +220,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -277,7 +257,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
    }
  });
  this->MooeeInvTime += usecond();
 }
 template<class Impl>
@@ -297,8 +276,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  auto pueem= & this->ueem[0];
  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -343,7 +320,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
      }
  });
  this->MooeeInvTime += usecond();
 }
 template<class Impl>
@@ -363,9 +339,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  auto pleem= & this->leem[0];
  auto pueem= & this->ueem[0];
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -402,7 +375,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
      coalescedWrite(chi[ss+s],res);
    }
  });
  this->MooeeInvTime += usecond();
 }
 template<class Impl>
@@ -423,9 +395,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
      uint64_t ss=sss*Ls;
@@ -469,7 +438,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
      }
  });
  this->MooeeInvTime += usecond();
 }
 NAMESPACE_END(Grid);
@@ -263,7 +263,6 @@ void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionFiel
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());
@@ -275,7 +274,6 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -288,7 +286,6 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -345,47 +342,33 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  Compressor compressor; 
  int len =  U.Grid()->oSites();
  DhopTotalTime   -= usecond();
  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime    += usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime    += usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  // First to enter, last to leave timing
  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
  DhopFaceTime    -= usecond();
  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime2    += usecond();
 }
 template <class Impl>
@@ -396,78 +379,16 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  DhopTotalTime   -= usecond();
  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();
  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 };
  ////////////////////////////////////////////////////////////////
  // Reporting
  ////////////////////////////////////////////////////////////////
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::Report(void) 
 {
  Coordinate latt = _grid->GlobalDimensions();
  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : " 
 	    << DhopCalls   << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : " 
 	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : " 
 	    << DhopCommTime    / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : " 
 	    << DhopComputeTime / DhopCalls << " us" << std::endl;
  // Average the compute time
  _grid->GlobalSum(DhopComputeTime);
  DhopComputeTime/=NP;
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::ZeroCounters(void) 
 {
  DhopCalls       = 0;
  DhopTotalTime   = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopFaceTime    = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
@@ -60,8 +60,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid)
+  _tmp(&FiveDimRedBlackGrid),
  Dirichlet(0)
 {
  Stencil.lo     = &Lebesgue;
  StencilEven.lo = &LebesgueEvenOdd;
  StencilOdd.lo  = &LebesgueEvenOdd;
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
@@ -91,6 +96,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
  }
  if ( p.dirichlet.size() == Nd+1) {
    Coordinate block = p.dirichlet;
    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
      Dirichlet = 1;
      std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
      std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
      Block = block;
    }
  } else {
    Coordinate block(Nd+1,0);
    Block = block;
  }
  if (Impl::LsVectorised) { 
    int nsimd = Simd::Nsimd();
@@ -125,99 +143,38 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  StencilEven.BuildSurfaceList(LLs,vol4);
   StencilOdd.BuildSurfaceList(LLs,vol4);
   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
   //                       <<" " << StencilEven.surface_list.size()<<std::endl;
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
  RealD NP     = _FourDimGrid->_Nprocessors;
  RealD NN     = _FourDimGrid->NodeCount();
  RealD volume = Ls;  
  Coordinate latt = _FourDimGrid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls   : " << DhopCalls   << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
    // Average the compute time
    _FourDimGrid->GlobalSum(DhopComputeTime);
    DhopComputeTime/=NP;
    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
    RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
   }
  if ( DerivCalls > 0 ) {
    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }
  if (DerivCalls > 0 || DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
  }
  if ( DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
  }
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::ZeroCounters(void) {
  DhopCalls       = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopComputeTime2= 0;
  DhopFaceTime    = 0;
  DhopTotalTime   = 0;
  DerivCalls       = 0;
  DerivCommTime    = 0;
  DerivComputeTime = 0;
  DerivDhopComputeTime = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
  Stencil.ZeroCountersi();
  StencilEven.ZeroCountersi();
  StencilOdd.ZeroCountersi();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  GaugeField HUmu(_Umu.Grid());
  HUmu = _Umu*(-0.5);
  if ( Dirichlet ) {
    if ( this->Params.partialDirichlet ) {
      std::cout << GridLogMessage << " partialDirichlet BCs " <<Block<<std::endl;
    } else {
      std::cout << GridLogMessage << " FULL Dirichlet BCs " <<Block<<std::endl;
    }
    std:: cout << GridLogMessage << "Checking block size multiple of rank boundaries for Dirichlet"<<std::endl;
    for(int d=0;d<Nd;d++) {
      int GaugeBlock = Block[d+1];
      int ldim=GaugeGrid()->LocalDimensions()[d];
      if (GaugeBlock) assert( (GaugeBlock%ldim)==0);
    }
    if (!this->Params.partialDirichlet) {
      std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " <<Block<<std::endl;
      Coordinate GaugeBlock(Nd);
      for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
      DirichletFilter<GaugeField> Filter(GaugeBlock);
      Filter.applyFilter(HUmu);
    } else {
      std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " NOT filtered gauge field" <<std::endl;
    }
  }
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
@@ -259,7 +216,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 					  const FermionField &B,
 					  int dag)
 {
  DerivCalls++;
  assert((dag==DaggerNo) ||(dag==DaggerYes));
  conformable(st.Grid(),A.Grid());
@@ -270,15 +226,12 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
  FermionField Btilde(B.Grid());
  FermionField Atilde(B.Grid());
  DerivCommTime-=usecond();
  st.HaloExchange(B,compressor);
  DerivCommTime+=usecond();
  Atilde=A;
  int LLs = B.Grid()->_rdimensions[0];
  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma if dag
@@ -290,8 +243,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    // Call the single hop
    ////////////////////////
    DerivDhopComputeTime -= usecond();
    int Usites = U.Grid()->oSites();
    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
@@ -299,10 +250,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    ////////////////////////////
    // spin trace outer product
    ////////////////////////////
    DerivDhopComputeTime += usecond();
    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
  }
  DerivComputeTime += usecond();
 }
 template<class Impl>
@@ -360,12 +309,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
@@ -374,6 +321,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
  GRID_TRACE("DhopInternalOverlappedComms");
  Compressor compressor(dag);
  int LLs = in.Grid()->_rdimensions[0];
@@ -382,53 +330,58 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  /////////////////////////////
  // Start comms  // Gather intranode and extra node differentiated??
  /////////////////////////////
-  DhopFaceTime-=usecond();
+  {
-  st.HaloExchangeOptGather(in,compressor);
+    GRID_TRACE("Gather");
-  DhopFaceTime+=usecond();
+    st.HaloExchangeOptGather(in,compressor);
-
+    accelerator_barrier();
-  DhopCommTime -=usecond();
+  }
  std::vector<std::vector<CommsRequest_t> > requests;
  auto id=traceStart("Communicate overlapped");
  st.CommunicateBegin(requests);
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  DhopFaceTime-=usecond();
+  {
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+    GRID_TRACE("MergeSHM");
-  DhopFaceTime+=usecond();
+    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  }
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  } else {
    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
  DhopComputeTime+=usecond();
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
-  DhopCommTime   +=usecond();
+  traceStop(id);
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
-  DhopFaceTime-=usecond();
+  {
-  st.CommsMerge(compressor);
+    GRID_TRACE("Merge");
-  DhopFaceTime+=usecond();
+    st.CommsMerge(compressor);
  }
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
 }
@@ -438,29 +391,30 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
 						    const FermionField &in, 
 						    FermionField &out,int dag)
 {
  GRID_TRACE("DhopInternalSerialComms");
  Compressor compressor(dag);
  int LLs = in.Grid()->_rdimensions[0];
  {
    GRID_TRACE("HaloExchange");
    st.HaloExchangeOpt(in,compressor);
  }
  DhopCommTime-=usecond();
  st.HaloExchangeOpt(in,compressor);
  DhopCommTime+=usecond();
  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDag");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  } else {
    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check
@@ -472,7 +426,6 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check
@@ -484,7 +437,6 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
  conformable(in.Grid(),FermionGrid()); // verifies full grid
  conformable(in.Grid(),out.Grid());
@@ -539,12 +491,17 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  LatComplex    sk(_grid);  sk = Zero();
  LatComplex    sk2(_grid); sk2= Zero();
  LatComplex    W(_grid); W= Zero();
  LatComplex    a(_grid); a= Zero();
  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
  LatComplex 	cosha(_grid);
  LatComplex 	kmu(_grid);
  LatComplex 	Wea(_grid);
  LatComplex 	Wema(_grid);
  LatComplex 	ea(_grid);
  LatComplex 	ema(_grid);
  LatComplex 	eaLs(_grid);
  LatComplex 	emaLs(_grid);
  LatComplex 	ea2Ls(_grid);
  LatComplex 	ema2Ls(_grid);
  LatComplex 	sinha(_grid);
  LatComplex 	sinhaLs(_grid);
  LatComplex 	coshaLs(_grid);
@@ -579,39 +536,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  ////////////////////////////////////////////
  cosha = (one + W*W + sk) / (abs(W)*2.0);
-  // FIXME Need a Lattice acosh
+  ea = (cosha + sqrt(cosha*cosha-one));
-
+  ema= (cosha - sqrt(cosha*cosha-one));
-  {
+  eaLs = pow(ea,Ls);
-    autoView(cosha_v,cosha,CpuRead);
+  emaLs= pow(ema,Ls);
-    autoView(a_v,a,CpuWrite);
+  ea2Ls = pow(ea,2.0*Ls);
-    for(int idx=0;idx<_grid->lSites();idx++){
+  ema2Ls= pow(ema,2.0*Ls);
-      Coordinate lcoor(Nd);
+  Wea= abs(W) * ea;
-      Tcomplex cc;
+  Wema= abs(W) * ema;
-      //    RealD sgn;
+  //  a=log(ea);
-      _grid->LocalIndexToLocalCoor(idx,lcoor);
+  
-      peekLocalSite(cc,cosha_v,lcoor);
+  sinha = 0.5*(ea - ema);
-      assert((double)real(cc)>=1.0);
+  sinhaLs = 0.5*(eaLs-emaLs);
-      assert(fabs((double)imag(cc))<=1.0e-15);
+  coshaLs = 0.5*(eaLs+emaLs);
      cc = ScalComplex(::acosh(real(cc)),0.0);
      pokeLocalSite(cc,a_v,lcoor);
    }
  }
  Wea = ( exp( a) * abs(W)  );
  Wema= ( exp(-a) * abs(W)  );
  sinha = 0.5*(exp( a) - exp(-a));
  sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls));
  coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls));
  A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
-  F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass);
+  F = eaLs * (one - Wea + (Wema - one) * mass*mass);
-  F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass);
+  F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
  F = F - abs(W) * sinha * 4.0 * mass;
-  Bpp =  (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one);
+  Bpp =  (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
-  Bmm =  (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one);
+  Bmm =  (A/F) * (one - ea2Ls)  * (one - Wea) * (one - mass*mass * one);
-  App =  (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one);
+  App =  (A/F) * (ema2Ls - one) * ema * (ema - abs(W)) * (one - mass*mass * one);
-  Amm =  (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one);
+  Amm =  (A/F) * (one - ea2Ls)  * ea  * (ea  - abs(W)) * (one - mass*mass * one);
  ABpm = (A/F) * abs(W) * sinha * 2.0  * (one + mass * coshaLs * 2.0 + mass*mass * one);
  //P+ source, P- source
@@ -634,29 +581,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
      buf1_4d = Zero();
      ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
      //G(s,t)
-      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
+      bufR_4d = bufR_4d + A * eaLs * pow(ema,f) * signW * buf1_4d + A * emaLs * pow(ea,f) * signW * buf1_4d;
      //A++*exp(a(s+t))
-      bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + App * pow(ea,ss) * pow(ea,tt) * signW * buf1_4d ;
      //A+-*exp(a(s-t))
-      bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf1_4d ;
      //A-+*exp(a(-s+t))
-      bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf1_4d ;
      //A--*exp(a(-s-t))
-      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + Amm * pow(ema,ss) * pow(ema,tt) * signW * buf1_4d ;
      //GL
      buf2_4d = Zero();
      ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
      //G(s,t)
-      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
+      bufL_4d = bufL_4d + A * eaLs * pow(ema,f) * signW * buf2_4d + A * emaLs * pow(ea,f) * signW * buf2_4d;
      //B++*exp(a(s+t))
-      bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bpp * pow(ea,ss) * pow(ea,tt) * signW * buf2_4d ;
      //B+-*exp(a(s-t))
-      bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf2_4d ;
      //B-+*exp(a(-s+t))
-      bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf2_4d ;
      //B--*exp(a(-s-t))
-      bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bmm * pow(ema,ss) * pow(ema,tt) * signW * buf2_4d ;
    }
    InsertSlice(bufR_4d, GR, (ss-1), 0);
    InsertSlice(bufL_4d, GL, (ss-1), 0);
@@ -775,28 +722,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
  W = one - M5 + sk2;
  ////////////////////////////////////////////
-  // Cosh alpha -> alpha
+  // Cosh alpha -> exp(+/- alpha)
  ////////////////////////////////////////////
  cosha =  (one + W*W + sk) / (abs(W)*2.0);
-  // FIXME Need a Lattice acosh
+  Wea = abs(W)*(cosha + sqrt(cosha*cosha-one));
-  {
+  Wema= abs(W)*(cosha - sqrt(cosha*cosha-one));
  autoView(cosha_v,cosha,CpuRead);
  autoView(a_v,a,CpuWrite);
  for(int idx=0;idx<_grid->lSites();idx++){
    Coordinate lcoor(Nd);
    Tcomplex cc;
    //    RealD sgn;
    _grid->LocalIndexToLocalCoor(idx,lcoor);
    peekLocalSite(cc,cosha_v,lcoor);
    assert((double)real(cc)>=1.0);
    assert(fabs((double)imag(cc))<=1.0e-15);
    cc = ScalComplex(::acosh(real(cc)),0.0);
    pokeLocalSite(cc,a_v,lcoor);
  }}
  Wea = ( exp( a) * abs(W)  );
  Wema= ( exp(-a) * abs(W)  );
  num   = num + ( one - Wema ) * mass * in;
  denom= ( Wea - one ) + mass*mass * (one - Wema); 
@@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      _tmp(&Hgrid),
      anisotropyCoeff(anis)
 {
  Stencil.lo     = &Lebesgue;
  StencilEven.lo = &LebesgueEvenOdd;
  StencilOdd.lo  = &LebesgueEvenOdd;
  // Allocate the required comms buffer
  ImportGauge(_Umu);
  if  (anisotropyCoeff.isAnisotropic){
@@ -76,91 +79,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
  StencilOdd.BuildSurfaceList(1,vol4);
 }
 template<class Impl>
 void WilsonFermion<Impl>::Report(void)
 {
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
  RealD volume = 1;
  Coordinate latt = _grid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls   : " << DhopCalls   << std::endl;
    std::cout << GridLogMessage << "WilsonFermion TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
    // Average the compute time
    _grid->GlobalSum(DhopComputeTime);
    DhopComputeTime/=NP;
    RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
    RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
   }
  if ( DerivCalls > 0 ) {
    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
    std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls    : " <<DerivCalls <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
    // how to count flops here?
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call               ? : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node      ? : " << mflops/NP << std::endl;
    // how to count flops here?
    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)        ? : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl;  }
  if (DerivCalls > 0 || DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion Stencil"    <<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl;  StencilOdd.Report();
  }
  if ( DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
  }
 }
 template<class Impl>
 void WilsonFermion<Impl>::ZeroCounters(void) {
  DhopCalls       = 0; // ok
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopComputeTime2= 0;
  DhopFaceTime    = 0;
  DhopTotalTime   = 0;
  DerivCalls       = 0; // ok
  DerivCommTime    = 0;
  DerivComputeTime = 0;
  DerivDhopComputeTime = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
  Stencil.ZeroCountersi();
  StencilEven.ZeroCountersi();
  StencilOdd.ZeroCountersi();
 }
 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@@ -320,7 +238,6 @@ template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                        GaugeField &mat, const FermionField &A,
                                        const FermionField &B, int dag) {
  DerivCalls++;
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
@@ -329,11 +246,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
  FermionField Atilde(B.Grid());
  Atilde = A;
  DerivCommTime-=usecond();
  st.HaloExchange(B, compressor);
  DerivCommTime+=usecond();
  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma (1+g)<->(1-g) if dag
@@ -341,7 +255,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    int gamma = mu;
    if (!dag) gamma += Nd;
    DerivDhopComputeTime -= usecond();
    int Ls=1;
    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@@ -349,9 +262,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    // spin trace outer product
    //////////////////////////////////////////////////
    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
    DerivDhopComputeTime += usecond();
  }
  DerivComputeTime += usecond();
 }
 template <class Impl>
@@ -398,7 +309,6 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());
@@ -410,7 +320,6 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -423,7 +332,6 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -488,14 +396,12 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
  DhopTotalTime-=usecond();
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else
 #endif
    DhopInternalSerial(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
 template <class Impl>
@@ -504,6 +410,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
 						      const FermionField &in,
 						      FermionField &out, int dag)
 {
  GRID_TRACE("DhopOverlapped");
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
@@ -514,53 +421,55 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
  /////////////////////////////
  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
-  DhopFaceTime-=usecond();
+  {
-  st.HaloGather(in,compressor);
+    GRID_TRACE("Gather");
-  DhopFaceTime+=usecond();
+    st.HaloGather(in,compressor);
  }
-  DhopCommTime -=usecond();
+  tracePush("Communication");
  st.CommunicateBegin(requests);
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  DhopFaceTime-=usecond();
+  {
-  st.CommsMergeSHM(compressor);
+    GRID_TRACE("MergeSHM");
-  DhopFaceTime+=usecond();
+    st.CommsMergeSHM(compressor);
  }
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt;
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  }
  DhopComputeTime+=usecond();
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
-  DhopCommTime   +=usecond();
+  tracePop("Communication");
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
  {
    GRID_TRACE("Merge");
    st.CommsMerge(compressor);
  }
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  } else {
    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
 };
@@ -570,20 +479,22 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
  GRID_TRACE("DhopSerial");
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
-  DhopCommTime-=usecond();
+  {
-  st.HaloExchange(in, compressor);
+    GRID_TRACE("HaloExchange");
-  DhopCommTime+=usecond();
+    st.HaloExchange(in, compressor);
  }
  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDag");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 };
 /*Change ends */
@@ -72,20 +72,15 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  if (SE->_is_local) {						\
    int perm= SE->_permute;					\
    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
-    spProj(chi,tmp);						\
+    spProj(chi,tmp);							\
-  } else if ( st.same_node[Dir] ) {				\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);			\
-    chi = coalescedRead(buf[SE->_offset],lane);			\
+    Recon(result, Uchi);						\
-  }								\
+  }									\
  acceleratorSynchronise();						\
  if (SE->_is_local || st.same_node[Dir] ) {			\
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
    Recon(result, Uchi);					\
  }								\
  acceleratorSynchronise();
 #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
  SE = st.GetEntry(ptype, Dir, sF);				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+  if (!SE->_is_local ) {		\
    auto chi = coalescedRead(buf[SE->_offset],lane);		\
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
    Recon(result, Uchi);					\
@@ -416,19 +411,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }
 #define KERNEL_CALL_TMP(A) \
  const uint64_t    NN = Nsite*Ls;					\
  auto U_p = & U_v[0];							\
  auto in_p = & in_v[0];						\
  auto out_p = & out_v[0];						\
  auto st_p = st_v._entries_p;						\
  auto st_perm = st_v._permute_type;					\
  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
      int sF = ss;							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p);	\
    });									\
  accelerator_barrier();
 #define KERNEL_CALLNB(A)						\
  const uint64_t    NN = Nsite*Ls;					\
@@ -440,12 +422,34 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
 #define KERNEL_CALL_EXT(A)						\
  const uint64_t    NN = Nsite*Ls;					\
  const uint64_t    sz = st.surface_list.size();			\
  auto ptr = &st.surface_list[0];					\
  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
      int sF = ptr[ss];							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
    });									
 #define ASM_CALL(A)							\
-  thread_for( ss, Nsite, {						\
+  thread_for( sss, Nsite, {						\
    int ss = st.lo->Reorder(sss);					\
    int sU = ss;							\
    int sF = ss*Ls;							\
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
  });
 #define ASM_CALL_SLICE(A)						\
  auto grid = in.Grid() ;						\
  int nt = grid->LocalDimensions()[4];					\
  int nxyz = Nsite/nt ;							\
  for(int t=0;t<nt;t++){						\
  thread_for( sss, nxyz, {						\
    int ss = t*nxyz+sss;						\
    int sU = ss;							\
    int sF = ss*Ls;							\
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
    });}
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -508,7 +512,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
     acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
@@ -9,6 +9,7 @@ STAG5_IMPL_LIST=""
 WILSON_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
 	   WilsonImplD2 \
 	   WilsonAdjImplF \
 	   WilsonAdjImplD \
 	   WilsonTwoIndexSymmetricImplF \
@@ -25,8 +26,9 @@ COMPACT_WILSON_IMPL_LIST=" \
 DWF_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
 	   WilsonImplD2 \
 	   ZWilsonImplF \
-	   ZWilsonImplD "
+	   ZWilsonImplD2 "
 GDWF_IMPL_LIST=" \
 	   GparityWilsonImplF \
@@ -0,0 +1,115 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // DDHMC filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 template<typename GaugeField>
 struct DDHMCFilter: public MomentumFilterBase<GaugeField>
 {
  Coordinate Block;
  int Width;
  DDHMCFilter(const Coordinate &_Block,int _Width=2): Block(_Block) { Width=_Width; }
  void applyFilter(GaugeField &U) const override
  {
    GridBase *grid = U.Grid();
    Coordinate Global=grid->GlobalDimensions();
    GaugeField zzz(grid); zzz = Zero();
    LatticeInteger coor(grid); 
    auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
    ////////////////////////////////////////////////////
    // Zero BDY layers
    ////////////////////////////////////////////////////
    std::cout<<GridLogMessage<<" DDHMC Force Filter Block "<<Block<<" width " <<Width<<std::endl;
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	LatticeCoordinate(coor,mu);
 	////////////////////////////////
 	// OmegaBar - zero all links contained in slice B-1,0 and
 	// mu links connecting to Omega
 	////////////////////////////////
 	if ( Width==1) { 
 	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	  U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
 	if ( Width==2) { 
 	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
 	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	  U_mu = where(mod(coor,B1)==Integer(B1-3),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
 	if ( Width==3) { 
 	  U    = where(mod(coor,B1)==Integer(B1-3),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(2)   ,zzz,U); 
 	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	  U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
 	if ( Width==4) { 
 	  U    = where(mod(coor,B1)==Integer(B1-4),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-3),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(2)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(3)   ,zzz,U); 
 	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	  U_mu = where(mod(coor,B1)==Integer(B1-5),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
      }
    }
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,71 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct DirichletFilter: public MomentumFilterBase<MomentaField>
 {
  typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
  typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
  typedef iScalar<iScalar<iScalar<vector_type> > >            ScalarType; //complex phase for each site
  Coordinate Block;
  DirichletFilter(const Coordinate &_Block): Block(_Block){}
  void applyFilter(MomentaField &P) const override
  {
    GridBase *grid = P.Grid();
    typedef decltype(PeekIndex<LorentzIndex>(P, 0)) LatCM;
    ////////////////////////////////////////////////////
    // Zero strictly links crossing between domains
    ////////////////////////////////////////////////////
    LatticeInteger coor(grid); 
    LatCM zz(grid); zz = Zero();
    for(int mu=0;mu<Nd;mu++) {
      if ( (Block[mu]) && (Block[mu] <= grid->GlobalDimensions()[mu] ) ) {
 	// If costly could provide Grid earlier and precompute masks
 	std::cout << GridLogMessage << " Dirichlet in mu="<<mu<<std::endl;
 	LatticeCoordinate(coor,mu);
 	auto P_mu = PeekIndex<LorentzIndex>(P, mu);
 	P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu);
 	PokeIndex<LorentzIndex>(P, P_mu, mu);
      }
    }
  }
 };
 NAMESPACE_END(Grid);
@@ -37,7 +37,8 @@ NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct MomentumFilterBase{
-  virtual void applyFilter(MomentaField &P) const;
+  virtual void applyFilter(MomentaField &P) const = 0;
  virtual ~MomentumFilterBase(){};
 };
 //Do nothing
@@ -83,7 +84,6 @@ struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
  }
 };
@@ -69,6 +69,11 @@ public:
    return PeriodicBC::ShiftStaple(Link,mu);
  }
  //Same as Cshift for periodic BCs
  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
    return PeriodicBC::CshiftLink(Link,mu,shift);
  }
  static inline bool isPeriodicGaugeField(void) { return true; }
 };
@@ -110,6 +115,11 @@ public:
      return PeriodicBC::CovShiftBackward(Link, mu, field);
  }
  //If mu is a conjugate BC direction
  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
  //       = U^T_\mu(L-1)  | x_\mu == 0
  //else
  //Out(x) = U^dag_\mu(x-mu mod L)
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
  {
@@ -129,6 +139,13 @@ public:
      return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }
  //If mu is a conjugate BC direction
  //Out(x) = S_\mu(x+mu)  | x_\mu != L-1
  //       = S*_\mu(x+mu)  | x_\mu == L-1
  //else
  //Out(x) = S_\mu(x+mu mod L)
  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
  {
    assert(_conjDirs.size() == Nd);
@@ -138,6 +155,27 @@ public:
      return PeriodicBC::ShiftStaple(Link,mu);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices
  //For conjugate BC direction
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = U*_\mu(0)  | x_\mu == L-1
  //shift = -1
  //Out(x) = U_\mu(x-mu)  | x_\mu != 0
  //       = U*_\mu(L-1)  | x_\mu == 0
  //else
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu mod L)
  //shift = -1
  //Out(x) = U_\mu(x-\hat\mu mod L)
  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
    assert(_conjDirs.size() == Nd);
    if(_conjDirs[mu]) 
      return ConjugateBC::CshiftLink(Link,mu,shift);
    else     
      return PeriodicBC::CshiftLink(Link,mu,shift);
  }
  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
@@ -13,6 +13,31 @@ NAMESPACE_BEGIN(Grid);
      std::cout << GridLogMessage << "Pseudofermion action lamda_max "<<lambda_max<<"( bound "<<hi<<")"<<std::endl;
      assert( (lambda_max < hi) && " High Bounds Check on operator failed" );
    }
     template<class Field> void ChebyBoundsCheck(LinearOperatorBase<Field> &HermOp,
 						 Field &GaussNoise,
 						 RealD lo,RealD hi) 
    {
      int orderfilter = 1000;
      Chebyshev<Field> Cheb(lo,hi,orderfilter);
      GridBase *FermionGrid = GaussNoise.Grid();
      Field X(FermionGrid);
      Field Z(FermionGrid);
      X=GaussNoise;
      RealD Nx = norm2(X);
      Cheb(HermOp,X,Z);
      RealD Nz = norm2(Z);
      std::cout << "************************* "<<std::endl;
      std::cout << " noise                    = "<<Nx<<std::endl;
      std::cout << " Cheb x noise             = "<<Nz<<std::endl;
      std::cout << " Ratio                    = "<<Nz/Nx<<std::endl;
      std::cout << "************************* "<<std::endl;
      assert( ((Nz/Nx)<1.0) && " ChebyBoundsCheck ");
    }
    template<class Field> void InverseSqrtBoundsCheck(int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
@@ -40,13 +65,65 @@ NAMESPACE_BEGIN(Grid);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
+      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }
    /* For a HermOp = M^dag M, check the approximation of  HermOp^{-1/inv_pow}
       by computing   |X -    HermOp * [ Hermop^{-1/inv_pow} ]^{inv_pow} X|  < tol  
       for noise X (aka GaussNoise).
       ApproxNegPow should be the rational approximation for   X^{-1/inv_pow}
    */
    template<class Field> void InversePowerBoundsCheck(int inv_pow,
 						       int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
 						       Field &GaussNoise,
 						       MultiShiftFunction &ApproxNegPow) 
    {
      GridBase *FermionGrid = GaussNoise.Grid();
      Field X(FermionGrid);
      Field Y(FermionGrid);
      Field Z(FermionGrid);
      Field tmp1(FermionGrid), tmp2(FermionGrid);
      X=GaussNoise;
      RealD Nx = norm2(X);
      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
      tmp1 = X;
      Field* in = &tmp1;
      Field* out = &tmp2;
      for(int i=0;i<inv_pow;i++){ //apply  [ Hermop^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X
 	msCG(HermOp, *in, *out); //backwards conventions!
 	if(i!=inv_pow-1) std::swap(in, out);
      }
      Z = *out;
      RealD Nz = norm2(Z);
      HermOp.HermOp(Z,Y);
      RealD Ny = norm2(Y);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
    }
 NAMESPACE_END(Grid);
@@ -0,0 +1,163 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundaryBoson.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
  RealD InnerStoppingCondition;
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
 public:
  DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
    : NumOp(_NumOp), 
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol),
      InnerStoppingCondition(_InnerTol),
      Phi(_NumOp.FermionGrid()) {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    // P(phi) = e^{- phi^dag P^dag P phi}
    //
    // NumOp == P
    //
    // Take phi = P^{-1} eta  ; eta = P Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=ActionStoppingCondition;
    NumOp.ImportGauge(U);
    FermionField eta(NumOp.FermionGrid());
    gaussian(pRNG,eta);    eta=eta*scale;
    NumOp.ProjectBoundaryBar(eta);
    //DumpSliceNorm("eta",eta);
    NumOp.RInv(eta,Phi);
    //DumpSliceNorm("Phi",Phi);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Pdag P phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=ActionStoppingCondition;
    NumOp.ImportGauge(U);
    FermionField Y(NumOp.FermionGrid());
    NumOp.R(Phi,Y);
    RealD action = norm2(Y);
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=DerivativeStoppingCondition;
    NumOp.ImportGauge(U);
    GridBase *fgrid = NumOp.FermionGrid();
    GridBase *ugrid = NumOp.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
    FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
    FermionField DoidP_Phi(fgrid);    // Vector E in my notes
    FermionField DobidDddDoidP_Phi(fgrid);    // Vector F in my notes
    FermionField P_Phi(fgrid);
    // P term
    NumOp.dBoundaryBar(Phi,tmp);
    NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
    NumOp.dBoundary(DobiDdbPhi,tmp);
    NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
    P_Phi  = Phi - DoiDdDobiDdbPhi;
    NumOp.ProjectBoundaryBar(P_Phi);
    // P^dag P term
    NumOp.dOmegaDagInv(P_Phi,DoidP_Phi); // Vector E
    NumOp.dBoundaryDag(DoidP_Phi,tmp);
    NumOp.dOmegaBarDagInv(tmp,DobidDddDoidP_Phi);   // Vector F
    NumOp.dBoundaryBarDag(DobidDddDoidP_Phi,tmp);
    X = DobiDdbPhi;
    Y = DobidDddDoidP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DoiDdDobiDdbPhi;
    Y = DoidP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,158 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  RealD InnerStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
  RealD refresh_action;
 public:
  DomainDecomposedBoundaryTwoFlavourPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol = 1.0e-6 )
    : DenOp(_DenOp),
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol),
      InnerStoppingCondition(_InnerTol),
      Phi(_DenOp.FermionGrid()) {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    // P(phi) = e^{- phi^dag Rdag^-1 R^-1 phi}
    //
    // DenOp == R
    //
    // Take phi = R eta  ; eta = R^-1 Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol     =ActionStoppingCondition;
    DenOp.ImportGauge(U);
    FermionField eta(DenOp.FermionGrid());
    gaussian(pRNG,eta);    eta=eta*scale;
    DenOp.ProjectBoundaryBar(eta);
    DenOp.R(eta,Phi);
    //DumpSliceNorm("Phi",Phi);
    refresh_action = norm2(eta);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Rdag^-1 R^-1 phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol=ActionStoppingCondition;
    DenOp.ImportGauge(U);
    FermionField X(DenOp.FermionGrid());
    DenOp.RInv(Phi,X);
    RealD action = norm2(X);
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol=DerivativeStoppingCondition;
    DenOp.ImportGauge(U);
    GridBase *fgrid = DenOp.FermionGrid();
    GridBase *ugrid = DenOp.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DiDdb_Phi(fgrid);      // Vector C in my notes
    FermionField DidRinv_Phi(fgrid);    // Vector D in my notes
    FermionField Rinv_Phi(fgrid);
 //   FermionField RinvDagRinv_Phi(fgrid);
 //   FermionField DdbdDidRinv_Phi(fgrid);
    // R^-1 term
    DenOp.dBoundaryBar(Phi,tmp);
    DenOp.Dinverse(tmp,DiDdb_Phi);            // Vector C
    Rinv_Phi = Phi - DiDdb_Phi;
    DenOp.ProjectBoundaryBar(Rinv_Phi); 
    // R^-dagger R^-1 term
    DenOp.DinverseDag(Rinv_Phi,DidRinv_Phi); // Vector D
 /*
    DenOp.dBoundaryBarDag(DidRinv_Phi,DdbdDidRinv_Phi);
    RinvDagRinv_Phi = Rinv_Phi - DdbdDidRinv_Phi;
    DenOp.ProjectBoundaryBar(RinvDagRinv_Phi);
 */
    X = DiDdb_Phi;
    Y = DidRinv_Phi;
    DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
    DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    DumpSliceNorm("force",dSdU);
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,237 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
  SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
  RealD InnerStoppingCondition;
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
 public:
  DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp, 
 						       SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,
 						       RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
    : NumOp(_NumOp), DenOp(_DenOp),
      Phi(_NumOp.PeriodicFermOpD.FermionGrid()),
      InnerStoppingCondition(_InnerTol),
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol)
  {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField eta(NumOp.PeriodicFermOpD.FermionGrid());
    FermionField tmp(NumOp.PeriodicFermOpD.FermionGrid());
    // P(phi) = e^{- phi^dag P^dag Rdag^-1 R^-1 P phi}
    //
    // NumOp == P
    // DenOp == R
    //
    // Take phi = P^{-1} R eta  ; eta = R^-1 P Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    gaussian(pRNG,eta);    eta=eta*scale;
    NumOp.ProjectBoundaryBar(eta);
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = ActionStoppingCondition;
    NumOp.tol = ActionStoppingCondition;
    DenOp.R(eta,tmp);
    NumOp.RInv(tmp,Phi);
    DumpSliceNorm("Phi",Phi);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Pdag Rdag^-1 R^-1 P phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField X(NumOp.PeriodicFermOpD.FermionGrid());
    FermionField Y(NumOp.PeriodicFermOpD.FermionGrid());
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = ActionStoppingCondition;
    NumOp.tol = ActionStoppingCondition;
    NumOp.R(Phi,Y);
    DenOp.RInv(Y,X);
    RealD action = norm2(X);
    //    std::cout << " DD boundary action is " <<action<<std::endl;
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    GridBase *fgrid = NumOp.PeriodicFermOpD.FermionGrid();
    GridBase *ugrid = NumOp.PeriodicFermOpD.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
    FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
    FermionField DiDdbP_Phi(fgrid);      // Vector C in my notes
    FermionField DidRinvP_Phi(fgrid);    // Vector D in my notes
    FermionField DdbdDidRinvP_Phi(fgrid);
    FermionField DoidRinvDagRinvP_Phi(fgrid);    // Vector E in my notes
    FermionField DobidDddDoidRinvDagRinvP_Phi(fgrid);    // Vector F in my notes
    FermionField P_Phi(fgrid);
    FermionField RinvP_Phi(fgrid);
    FermionField RinvDagRinvP_Phi(fgrid);
    FermionField PdagRinvDagRinvP_Phi(fgrid);
    //    RealD action = S(U);
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = DerivativeStoppingCondition;
    NumOp.tol = DerivativeStoppingCondition;
    // P term
    NumOp.dBoundaryBar(Phi,tmp);
    NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
    NumOp.dBoundary(DobiDdbPhi,tmp);
    NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
    P_Phi  = Phi - DoiDdDobiDdbPhi;
    NumOp.ProjectBoundaryBar(P_Phi);
    // R^-1 P term
    DenOp.dBoundaryBar(P_Phi,tmp);
    DenOp.Dinverse(tmp,DiDdbP_Phi);            // Vector C
    RinvP_Phi = P_Phi - DiDdbP_Phi;
    DenOp.ProjectBoundaryBar(RinvP_Phi); // Correct to here
    // R^-dagger R^-1 P term
    DenOp.DinverseDag(RinvP_Phi,DidRinvP_Phi); // Vector D
    DenOp.dBoundaryBarDag(DidRinvP_Phi,DdbdDidRinvP_Phi);
    RinvDagRinvP_Phi = RinvP_Phi - DdbdDidRinvP_Phi;
    DenOp.ProjectBoundaryBar(RinvDagRinvP_Phi);
    // P^dag R^-dagger R^-1 P term
    NumOp.dOmegaDagInv(RinvDagRinvP_Phi,DoidRinvDagRinvP_Phi); // Vector E
    NumOp.dBoundaryDag(DoidRinvDagRinvP_Phi,tmp);
    NumOp.dOmegaBarDagInv(tmp,DobidDddDoidRinvDagRinvP_Phi);   // Vector F
    NumOp.dBoundaryBarDag(DobidDddDoidRinvDagRinvP_Phi,tmp);
    PdagRinvDagRinvP_Phi = RinvDagRinvP_Phi- tmp;
    NumOp.ProjectBoundaryBar(PdagRinvDagRinvP_Phi);
    /*
    std::cout << "S eval  "<< action << std::endl;
    std::cout << "S - IP1 "<< innerProduct(Phi,PdagRinvDagRinvP_Phi) << std::endl;
    std::cout << "S - IP2 "<< norm2(RinvP_Phi) << std::endl;
    NumOp.R(Phi,tmp);
    tmp = tmp - P_Phi;
    std::cout << "diff1 "<<norm2(tmp) <<std::endl;
    DenOp.RInv(P_Phi,tmp);
    tmp = tmp - RinvP_Phi;
    std::cout << "diff2 "<<norm2(tmp) <<std::endl;
    DenOp.RDagInv(RinvP_Phi,tmp);
    tmp  = tmp - RinvDagRinvP_Phi;
    std::cout << "diff3 "<<norm2(tmp) <<std::endl;
    DenOp.RDag(RinvDagRinvP_Phi,tmp);
    tmp  = tmp - PdagRinvDagRinvP_Phi;
    std::cout << "diff4 "<<norm2(tmp) <<std::endl;
    */
    dSdU=Zero();
    X = DobiDdbPhi;
    Y = DobidDddDoidRinvDagRinvP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DoiDdDobiDdbPhi;
    Y = DoidRinvDagRinvP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DiDdbP_Phi;
    Y = DidRinvP_Phi;
    DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
@@ -44,6 +44,10 @@ NAMESPACE_BEGIN(Grid);
  // Exact one flavour implementation of DWF determinant ratio //
  ///////////////////////////////////////////////////////////////
  //Note: using mixed prec CG for the heatbath solver in this action class will not work
  //      because the L, R operators must have their shift coefficients updated throughout the heatbath step
  //      You will find that the heatbath solver simply won't converge.
  //      To use mixed precision here use the ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction variant below
  template<class Impl>
  class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
  {
@@ -57,37 +61,60 @@ NAMESPACE_BEGIN(Grid);
      bool use_heatbath_forecasting;
      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
-      SchurRedBlackDiagMooeeSolve<FermionField> SolverHB;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBR;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverR;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverR;
      FermionField Phi; // the pseudofermion field for this trajectory
      RealD norm2_eta; //|eta|^2 where eta is the random gaussian field used to generate the pseudofermion field
      bool initial_action; //true for the first call to S after refresh, for which the identity S = |eta|^2 holds provided the rational approx is good
    public:
      //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator
      virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
 	AbstractEOFAFermion<Impl>&op = LorR == 0 ? Lop : Rop;
 	op.RefreshShiftCoefficients(to);
      }
      //Use the same solver for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& CG, 
 					      Params& p, 
 					      bool use_fc=false) 
-	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,p,use_fc) {};
+	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,CG,p,use_fc) {};
-	
+
      //Use the same solver for L,R in the heatbath but different solvers elsewhere
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
-					      OperatorFunction<FermionField>& HeatbathCG, 
+					      OperatorFunction<FermionField>& HeatbathCG,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false)
 	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,HeatbathCG,HeatbathCG, ActionCGL, ActionCGR, DerivCGL,DerivCGR,p,use_fc) {};
      //Use different solvers for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false) : 
        Lop(_Lop), 
 	Rop(_Rop), 
-	SolverHB(HeatbathCG,false,true),
+	SolverHBL(HeatbathCGL,false,true), SolverHBR(HeatbathCGR,false,true),
 	SolverL(ActionCGL, false, true), SolverR(ActionCGR, false, true), 
 	DerivativeSolverL(DerivCGL, false, true), DerivativeSolverR(DerivCGR, false, true), 
 	Phi(_Lop.FermionGrid()), 
 	param(p), 
-        use_heatbath_forecasting(use_fc)
+	use_heatbath_forecasting(use_fc),
 	initial_action(false)
      {
        AlgRemez remez(param.lo, param.hi, param.precision);
@@ -97,6 +124,8 @@ NAMESPACE_BEGIN(Grid);
        PowerNegHalf.Init(remez, param.tolerance, true);
      };
      const FermionField &getPhi() const{ return Phi; }
      virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
      virtual std::string LogParameters() {
@@ -117,6 +146,19 @@ NAMESPACE_BEGIN(Grid);
        else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
      }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
        // 
        RealD scale = std::sqrt(0.5);
        FermionField eta    (Lop.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refresh(U,eta);
      }
      // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
      // We generate a Gaussian noise vector \eta, and then compute
      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
@@ -124,12 +166,10 @@ NAMESPACE_BEGIN(Grid);
      //
      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
      //
-      virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
+     void refresh(const GaugeField &U, const FermionField &eta) {
      {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
        FermionField eta         (Lop.FermionGrid());
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        FermionField Forecast_src(Lop.FermionGrid());
@@ -140,11 +180,6 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
        // Seed with Gaussian noise vector (var = 0.5)
        RealD scale = std::sqrt(0.5);
        gaussian(pRNG,eta);
        eta = eta * scale;
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
        for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
@@ -160,15 +195,15 @@ NAMESPACE_BEGIN(Grid);
        tmp[1] = Zero();
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Lop.RefreshShiftCoefficients(-gamma_l);
+          heatbathRefreshShiftCoefficients(0, -gamma_l);
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
-            SolverHB(Lop, CG_src, CG_soln);
+            SolverHBL(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero(); // Just use zero as the initial guess
-            SolverHB(Lop, CG_src, CG_soln);
+	    SolverHBL(Lop, CG_src, CG_soln);
          }
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
@@ -187,15 +222,15 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
+	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]);
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero();
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
          }
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
@@ -205,49 +240,117 @@ NAMESPACE_BEGIN(Grid);
        Phi = Phi + tmp[1];
        // Reset shift coefficients for energy and force evals
-        Lop.RefreshShiftCoefficients(0.0);
+	heatbathRefreshShiftCoefficients(0, 0.0);
-        Rop.RefreshShiftCoefficients(-1.0);
+	heatbathRefreshShiftCoefficients(1, -1.0);
 	//Mark that the next call to S is the first after refresh
 	initial_action = true;
 	// Bounds check
 	RealD EtaDagEta = norm2(eta);
 	norm2_eta = EtaDagEta;
 	//	RealD PhiDagMPhi= norm2(eta);
      };
-      void Meofa(const GaugeField& U,const FermionField &phi, FermionField & Mphi) 
+      void Meofa(const GaugeField& U,const FermionField &in, FermionField & out) 
      {
 #if 0
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
-        FermionField spProj_Phi(Lop.FermionGrid());
+        FermionField spProj_in(Lop.FermionGrid());
 	FermionField mPhi(Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
-	mPhi = phi;
+	out = in;
        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(Phi, spProj_Phi, -1, Lop.Ls);
+        spProj(in, spProj_in, -1, Lop.Ls);
-        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
+        Lop.Omega(spProj_in, tmp[0], -1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverL(Lop, tmp[1], tmp[0]);
        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
        Lop.Omega(tmp[1], tmp[0], -1, 1);
-	mPhi = mPhi -  Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
+	spProj(tmp[0], tmp[1], -1, Lop.Ls);
 	out = out -  Lop.k * tmp[1];
        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
-        spProj(Phi, spProj_Phi, 1, Rop.Ls);
+        spProj(in, spProj_in, 1, Rop.Ls);
-        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
+        Rop.Omega(spProj_in, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverR(Rop, tmp[1], tmp[0]);
        Rop.Dtilde(tmp[0], tmp[1]);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
-        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
+	spProj(tmp[0], tmp[1], 1, Rop.Ls);
-#endif
+
        out = out + Rop.k * tmp[1];
      }
      //Due to the structure of EOFA, it is no more expensive to compute the inverse of Meofa
      //To ensure correctness we can simply reuse the heatbath code but use the rational approx
      //f(x) = 1/x   which corresponds to alpha_0=0,  alpha_1=1,  beta_1=0 => gamma_1=1
      void MeofaInv(const GaugeField &U, const FermionField &in, FermionField &out) {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
 	// = 1 * \eta
        out = in;
        // LH terms:
        // \Phi = \Phi + k \sum_{k=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
        //          - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
        spProj(in, tmp[0], -1, Lop.Ls);
        Lop.Omega(tmp[0], tmp[1], -1, 0);
        G5R5(CG_src, tmp[1]);
        {
          heatbathRefreshShiftCoefficients(0, -1.); //-gamma_1 = -1.
 	  CG_soln = Zero(); // Just use zero as the initial guess
 	  SolverHBL(Lop, CG_src, CG_soln);
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = Lop.k * tmp[0];
        }
        Lop.Omega(tmp[1], tmp[0], -1, 1);
        spProj(tmp[0], tmp[1], -1, Lop.Ls);
        out = out + tmp[1];
        // RH terms:
        // \Phi = \Phi - k \sum_{k=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
        //          - \beta_l\gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
        spProj(in, tmp[0], 1, Rop.Ls);
        Rop.Omega(tmp[0], tmp[1], 1, 0);
        G5R5(CG_src, tmp[1]);
        {
 	  heatbathRefreshShiftCoefficients(1, 0.); //-gamma_1 * beta_1 = 0
 	  CG_soln = Zero();
 	  SolverHBR(Rop, CG_src, CG_soln);
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = - Rop.k * tmp[0];
        }
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        spProj(tmp[0], tmp[1], 1, Rop.Ls);
        out = out + tmp[1];
        // Reset shift coefficients for energy and force evals
 	heatbathRefreshShiftCoefficients(0, 0.0);
 	heatbathRefreshShiftCoefficients(1, -1.0);
      };
      // EOFA action: see Eqn. (10) of arXiv:1706.05843
      virtual RealD S(const GaugeField& U)
      {
@@ -271,7 +374,7 @@ NAMESPACE_BEGIN(Grid);
        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
        spProj(Phi, spProj_Phi, 1, Rop.Ls);
        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
@@ -281,6 +384,26 @@ NAMESPACE_BEGIN(Grid);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
 	if(initial_action){
 	  //For the first call to S after refresh,  S = |eta|^2. We can use this to ensure the rational approx is good
 	  RealD diff = action - norm2_eta;
 	  //S_init = eta^dag M^{-1/2} M M^{-1/2} eta
 	  //S_init - eta^dag eta =  eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta
 	  //If approximate solution
 	  //S_init - eta^dag eta =  eta^dag ( [M^{-1/2}+\delta M^{-1/2}] M [M^{-1/2}+\delta M^{-1/2}] - 1 ) eta
 	  //               \approx  eta^dag ( \delta M^{-1/2} M^{1/2} + M^{1/2}\delta M^{-1/2} ) eta
 	  // We divide out |eta|^2 to remove source scaling but the tolerance on this check should still be somewhat higher than the actual approx tolerance
 	  RealD test = fabs(diff)/norm2_eta; //test the quality of the rational approx
 	  std::cout << GridLogMessage << action_name() << " initial action " << action << " expect " << norm2_eta << "; diff " << diff << std::endl;
 	  std::cout << GridLogMessage << action_name() << "[ eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta ]/|eta^2| = " << test << "  expect 0 (tol " << param.BoundsCheckTol << ")" << std::endl;
 	  assert( ( test < param.BoundsCheckTol ) && " Initial action check failed" );
 	  initial_action = false;
 	}
        return action;
      };
@@ -329,6 +452,40 @@ NAMESPACE_BEGIN(Grid);
      };
  };
  template<class ImplD, class ImplF>
  class ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction : public ExactOneFlavourRatioPseudoFermionAction<ImplD>{
  public:
    INHERIT_IMPL_TYPES(ImplD);
    typedef OneFlavourRationalParams Params;
  private:
    AbstractEOFAFermion<ImplF>& LopF; // the basic LH operator
    AbstractEOFAFermion<ImplF>& RopF; // the basic RH operator
  public:
    virtual std::string action_name() { return "ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction"; }
    //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator
    virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
      AbstractEOFAFermion<ImplF> &op = LorR == 0 ? LopF : RopF;
      op.RefreshShiftCoefficients(to);
      this->ExactOneFlavourRatioPseudoFermionAction<ImplD>::heatbathRefreshShiftCoefficients(LorR,to);
    }
    ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction(AbstractEOFAFermion<ImplF>& _LopF, 
 							     AbstractEOFAFermion<ImplF>& _RopF,
 							     AbstractEOFAFermion<ImplD>& _LopD, 
 							     AbstractEOFAFermion<ImplD>& _RopD,
 							     OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
 							     OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 							     OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 							     Params& p, 
 							     bool use_fc=false) : 
    LopF(_LopF), RopF(_RopF), ExactOneFlavourRatioPseudoFermionAction<ImplD>(_LopD, _RopD, HeatbathCGL, HeatbathCGR, ActionCGL, ActionCGR, DerivCGL, DerivCGR, p, use_fc){}
  };
 NAMESPACE_END(Grid);
 #endif
@@ -0,0 +1,434 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators
    /////////////////////////////////////////////////////////
    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\
 	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\
 	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi\
 	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
       BIG WARNING:	   
       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
       Thus for DWF the numerator operator is the Pauli-Villars operator
       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
    */
    template<class Impl>
    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef RationalActionParams Params;
      Params param;
      RealD  RefreshAction;
      //For action evaluation
      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}
      //For the MD integration
      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}
    private:
      FermionOperator<Impl> & NumOp;// the basic operator
      FermionOperator<Impl> & DenOp;// the basic operator
      FermionField PhiEven; // the pseudo fermion field for this trajectory
      FermionField PhiOdd; // the pseudo fermion field for this trajectory
      //Generate the approximation to x^{1/inv_pow} (->approx)   and x^{-1/inv_pow} (-> approx_inv)  by an approx_degree degree rational approximation
      //CG_tolerance is used to issue a warning if the approximation error is larger than the tolerance of the CG and is otherwise just stored in the MultiShiftFunction for use by the multi-shift
      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
 	std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
 	double error = remez.generateApprox(approx_degree,1,inv_pow);	
 	if(error > CG_tolerance)
 	  std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << error << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
 	approx.Init(remez, CG_tolerance,false);
 	approx_inv.Init(remez, CG_tolerance,true);
      }
    protected:
      static constexpr bool Numerator = true;
      static constexpr bool Denominator = false;
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
 	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
      }
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){
 	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
 	msCG(schurOp,in, out_elems, out);
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const GaugeField &U){
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
      }
    public:
      // allow non-uniform tolerances 
      void SetTolerances(std::vector<RealD> action_tolerance,std::vector<RealD> md_tolerance)
      {
 	assert(action_tolerance.size()==ApproxPowerAction.tolerances.size());
 	assert(    md_tolerance.size()==ApproxPowerMD.tolerances.size());
 	// Fix up the tolerances
 	for(int i=0;i<ApproxPowerAction.tolerances.size();i++){
 	  ApproxPowerAction.tolerances[i]       = action_tolerance[i];
 	  ApproxNegPowerAction.tolerances[i]    = action_tolerance[i];
 	  ApproxHalfPowerAction.tolerances[i]   = action_tolerance[i];
 	  ApproxNegHalfPowerAction.tolerances[i]= action_tolerance[i];
 	}
 	for(int i=0;i<ApproxPowerMD.tolerances.size();i++){
 	  ApproxPowerMD.tolerances[i]       = md_tolerance[i];
 	  ApproxNegPowerMD.tolerances[i]    = md_tolerance[i];
 	  ApproxHalfPowerMD.tolerances[i]   = md_tolerance[i];
 	  ApproxNegHalfPowerMD.tolerances[i]= md_tolerance[i];
 	}
 	// Print out - could deprecate
 	for(int i=0;i<ApproxPowerMD.tolerances.size();i++) {
 	  std::cout<<GridLogMessage << " ApproxPowerMD shift["<<i<<"] "
 		   <<" pole    "<<ApproxPowerMD.poles[i]
 		   <<" residue "<<ApproxPowerMD.residues[i]
 		   <<" tol     "<<ApproxPowerMD.tolerances[i]<<std::endl;
 	}
 	/*
 	  for(int i=0;i<ApproxNegPowerMD.tolerances.size();i++) {
 	  std::cout<<GridLogMessage << " ApproxNegPowerMD shift["<<i<<"] "
 		   <<" pole    "<<ApproxNegPowerMD.poles[i]
 		   <<" residue "<<ApproxNegPowerMD.residues[i]
 		   <<" tol     "<<ApproxNegPowerMD.tolerances[i]<<std::endl;
 	}
 	for(int i=0;i<ApproxHalfPowerMD.tolerances.size();i++) {
 	  std::cout<<GridLogMessage << " ApproxHalfPowerMD shift["<<i<<"] "
 		   <<" pole    "<<ApproxHalfPowerMD.poles[i]
 		   <<" residue "<<ApproxHalfPowerMD.residues[i]
 		   <<" tol     "<<ApproxHalfPowerMD.tolerances[i]<<std::endl;
 	}
 	for(int i=0;i<ApproxNegHalfPowerMD.tolerances.size();i++) {
 	  std::cout<<GridLogMessage << " ApproxNegHalfPowerMD shift["<<i<<"] "
 		   <<" pole    "<<ApproxNegHalfPowerMD.poles[i]
 		   <<" residue "<<ApproxNegHalfPowerMD.residues[i]
 		   <<" tol     "<<ApproxNegHalfPowerMD.tolerances[i]<<std::endl;
 	}
 	*/
      }
      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 						     FermionOperator<Impl>  &_DenOp, 
 						     const Params & p
 						     ) : 
 	NumOp(_NumOp), 
 	DenOp(_DenOp), 
 	PhiOdd (_NumOp.FermionRedBlackGrid()),
 	PhiEven(_NumOp.FermionRedBlackGrid()),
 	param(p) 
      {
 	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
 	AlgRemez remez(param.lo,param.hi,param.precision);
 	//Generate approximations for action eval
 	generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
 	generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);
 	//Generate approximations for MD
 	if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
 	  generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	  generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	}else{
 	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
 	  ApproxPowerMD = ApproxPowerAction; 
 	  ApproxNegPowerMD = ApproxNegPowerAction;
 	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
 	    ApproxNegPowerMD.tolerances[i] = ApproxPowerMD.tolerances[i] = param.md_tolerance; //used for multishift
 	  ApproxHalfPowerMD = ApproxHalfPowerAction;
 	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
 	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
 	    ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
 	}
 	std::vector<RealD> action_tolerance(ApproxHalfPowerAction.tolerances.size(),param.action_tolerance);
 	std::vector<RealD> md_tolerance    (ApproxHalfPowerMD.tolerances.size(),param.md_tolerance);
 	SetTolerances(action_tolerance, md_tolerance);
 	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
      };
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";}
      virtual std::string LogParameters(){
 	std::stringstream sstream;
 	sstream << GridLogMessage << "["<<action_name()<<"] Power              : 1/" << param.inv_pow <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Low                :" << param.lo <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] High               :" << param.hi <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations     :" << param.MaxIter <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
 	return sstream.str();
      }
      //Access the fermion field
      const FermionField &getPhiOdd() const{ return PhiOdd; }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	FermionField eta(NumOp.FermionGrid());	
 	// P(eta) \propto e^{- eta^dag eta}
 	//	
 	// The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
 	// Thus eta = x/sqrt{2} = x * sqrt(1/2)
 	RealD scale = std::sqrt(0.5);
 	gaussian(pRNG,eta);	eta=eta*scale;
 	refresh(U,eta);
      }
      //Allow for manual specification of random field for testing
      void refresh(const GaugeField &U, const FermionField &eta) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
 	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
 	//
 	// Phi =  (VdagV)^-1/(2*inv_pow) Mdag^{1/(2*inv_pow)} eta 
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	ImportGauge(U);
 	// MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
 	multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);
 	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
 	multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
 	RefreshAction = norm2( etaOdd );
        std::cout<<GridLogMessage << action_name() << " refresh: action is " << RefreshAction << std::endl;
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      virtual RealD Sinitial(const GaugeField &U) {
 	std::cout << GridLogMessage << "Returning stored two flavour refresh action "<<RefreshAction<<std::endl;
 	return RefreshAction;
      }
      virtual RealD S(const GaugeField &U) {
 	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
 	ImportGauge(U);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	// VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);
 	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
 	// Randomly apply rational bounds checks.
 	int rcheck = rand();
 	auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r);
 	if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { 
 	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	  std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
 	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
 	  std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
 	}
 	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	RealD action = norm2(Y);
 	std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
 	const int n_f  = ApproxNegPowerMD.poles.size();
 	const int n_pv = ApproxHalfPowerMD.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	ImportGauge(U);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)	
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
 	for(int k=0;k<n_f;k++){
 	  ak = ApproxNegPowerMD.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
 	for(int k=0;k<n_pv;k++){
          ak = ApproxHalfPowerMD.residues[k];
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	//dSdU = Ta(dSdU);
 	std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
      };
    };
 NAMESPACE_END(Grid);
 #endif
@@ -0,0 +1,133 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h>
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
    // cf. GeneralEvenOddRational.h for details
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template<class ImplD, class ImplF, class ImplD2>
    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
    private:
      typedef typename ImplD2::FermionField FermionFieldD2;
      typedef typename ImplD::FermionField FermionFieldD;
      typedef typename ImplF::FermionField FermionFieldF;
      FermionOperator<ImplD> & NumOpD;
      FermionOperator<ImplD> & DenOpD;
      FermionOperator<ImplD2> & NumOpD2;
      FermionOperator<ImplD2> & DenOpD2;
      FermionOperator<ImplF> & NumOpF;
      FermionOperator<ImplF> & DenOpF;
      Integer ReliableUpdateFreq;
    protected:
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
 #if 0
 	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
 #else
 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
 	// Action better with higher precision?
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	precisionChange(inD2,in);
 	std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
 	msCG(schurOpD2, inD2, outD2);
 	precisionChange(out,outD2);
 #endif
      }
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
 	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF);
 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
 	std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
 	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	precisionChange(inD2,in);
 	std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
 	msCG(schurOpD2, inD2, out_elemsD2, outD2);
 	precisionChange(out,outD2);
 	for(int i=0;i<out_elems.size();i++){
 	  precisionChange(out_elems[i],out_elemsD2[i]);
 	}
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
 	typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid());
 	precisionChange(Uf, Ud);
 	precisionChange(Ud2, Ud);
 	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
 	NumOpD.ImportGauge(Ud);
 	DenOpD.ImportGauge(Ud);
 	NumOpF.ImportGauge(Uf);
 	DenOpF.ImportGauge(Uf);
 	NumOpD2.ImportGauge(Ud2);
 	DenOpD2.ImportGauge(Ud2);
      }
    public:
      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
 							      FermionOperator<ImplD2>  &_NumOpD2, FermionOperator<ImplD2>  &_DenOpD2, 
 							      const RationalActionParams & p, Integer _ReliableUpdateFreq
 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
 								  ReliableUpdateFreq(_ReliableUpdateFreq),
 								  NumOpD(_NumOpD), DenOpD(_DenOpD),
 								  NumOpF(_NumOpF), DenOpF(_DenOpF),
 								  NumOpD2(_NumOpD2), DenOpD2(_DenOpD2)
      {}
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
    };
 NAMESPACE_END(Grid);
 #endif
@@ -40,249 +40,64 @@ NAMESPACE_BEGIN(Grid);
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
    template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      Params param;
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
    private:
-     
+      static RationalActionParams transcribe(const Params &in){
-      FermionOperator<Impl> & NumOp;// the basic operator
+	RationalActionParams out;
-      FermionOperator<Impl> & DenOp;// the basic operator
+	out.inv_pow = 2;
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
+	out.lo = in.lo;
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+	out.hi = in.hi;
 	out.MaxIter = in.MaxIter;
 	out.action_tolerance = out.md_tolerance = in.tolerance;
 	out.action_degree = out.md_degree = in.degree;
 	out.precision = in.precision;
 	out.BoundsCheckFreq = in.BoundsCheckFreq;
 	return out;
      }
    public:
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
-					    FermionOperator<Impl>  &_DenOp, 
+							FermionOperator<Impl>  &_DenOp, 
-					    Params & p
+							const Params & p
-					    ) : 
+							) : 
-      NumOp(_NumOp), 
+	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}
      DenOp(_DenOp), 
      PhiOdd (_NumOp.FermionRedBlackGrid()),
      PhiEven(_NumOp.FermionRedBlackGrid()),
      param(p) 
      {
 	AlgRemez remez(param.lo,param.hi,param.precision);
-	// MdagM^(+- 1/2)
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+    };
 	remez.generateApprox(param.degree,1,2);
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
-	// MdagM^(+- 1/4)
+    template<class Impl,class ImplF,class ImplD2>
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+    class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction
-	remez.generateApprox(param.degree,1,4);
+      : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> {
-   	PowerQuarter.Init(remez,param.tolerance,false);
+    public:
-	PowerNegQuarter.Init(remez,param.tolerance,true);
+      typedef OneFlavourRationalParams Params;
-      };
+    private:
-
+      static RationalActionParams transcribe(const Params &in){
-      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
+	RationalActionParams out;
-
+	out.inv_pow = 2;
-      virtual std::string LogParameters(){
+	out.lo = in.lo;
-	std::stringstream sstream;
+	out.hi = in.hi;
-	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
+	out.MaxIter = in.MaxIter;
-	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
+	out.action_tolerance = out.md_tolerance = in.tolerance;
-	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
+	out.action_degree = out.md_degree = in.degree;
-	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
+	out.precision = in.precision;
-	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
+	out.BoundsCheckFreq = in.BoundsCheckFreq;
-	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
+	return out;
 	return sstream.str();
      }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
-	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    public:
-	//
+      OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
-	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
+								 FermionOperator<Impl>  &_DenOp, 
-	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
+								 FermionOperator<ImplF>  &_NumOpF, 
-	//
+								 FermionOperator<ImplF>  &_DenOpF, 
-	// Phi =  (VdagV)^-1/4 Mdag^{1/4} eta 
+								 FermionOperator<ImplD2>  &_NumOpD2, 
-	//
+								 FermionOperator<ImplD2>  &_DenOpD2, 
-	// P(eta) = e^{- eta^dag eta}
+								 const Params & p, Integer ReliableUpdateFreq
-	//
+							) : 
-	// e^{x^2/2 sig^2} => sig^2 = 0.5.
+	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){}
 	// 
 	// So eta should be of width sig = 1/sqrt(2).
-	RealD scale = std::sqrt(0.5);
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
 	FermionField eta(NumOp.FermionGrid());
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	gaussian(pRNG,eta);	eta=eta*scale;
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	// MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
 	msCG_M(MdagM,etaOdd,tmp);
 	// VdagV^-1/4 MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
 	msCG_V(VdagV,tmp,PhiOdd);
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	// VdagV^1/4 Phi
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	msCG_V(VdagV,PhiOdd,X);
 	// MdagM^-1/4 VdagV^1/4 Phi
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
 	msCG_M(MdagM,X,Y);
 	// Randomly apply rational bounds checks.
 	auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r);
        if ( (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
 	}
 	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
 	RealD action = norm2(Y);
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	const int n_f  = PowerNegHalf.poles.size();
 	const int n_pv = PowerQuarter.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
 	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)
 	for(int k=0;k<n_f;k++){
 	  ak = PowerNegHalf.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	for(int k=0;k<n_pv;k++){
          ak = PowerQuarter.residues[k];
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	//dSdU = Ta(dSdU);
      };
    };
 NAMESPACE_END(Grid);
@@ -49,10 +49,12 @@ NAMESPACE_BEGIN(Grid);
      Params param;
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerNegQuarter;
      MultiShiftFunction MDPowerQuarter;
      MultiShiftFunction MDPowerNegHalf;
    private:
      FermionOperator<Impl> & NumOp;// the basic operator
@@ -73,15 +75,22 @@ NAMESPACE_BEGIN(Grid);
 	remez.generateApprox(param.degree,1,2);
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
 	MDPowerNegHalf.Init(remez,param.mdtolerance,true);
 	// MdagM^(+- 1/4)
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
 	remez.generateApprox(param.degree,1,4);
   	PowerQuarter.Init(remez,param.tolerance,false);
   	MDPowerQuarter.Init(remez,param.mdtolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };
-      virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";}
+      virtual std::string action_name(){
 	std::stringstream sstream;
 	sstream<<"OneFlavourRatioRationalPseudoFermionAction("
 	       <<DenOp.Mass()<<") / det("<<NumOp.Mass()<<")";
 	return sstream.str();
      }
      virtual std::string LogParameters(){
 	std::stringstream sstream;
@@ -204,8 +213,8 @@ NAMESPACE_BEGIN(Grid);
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-	const int n_f  = PowerNegHalf.poles.size();
+	const int n_f  = MDPowerNegHalf.poles.size();
-	const int n_pv = PowerQuarter.poles.size();
+	const int n_pv = MDPowerQuarter.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionGrid());
@@ -224,8 +233,8 @@ NAMESPACE_BEGIN(Grid);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,MDPowerQuarter);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,MDPowerNegHalf);
 	msCG_V(VdagV,Phi,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
@@ -244,7 +253,7 @@ NAMESPACE_BEGIN(Grid);
 	//(1)
 	for(int k=0;k<n_f;k++){
-	  ak = PowerNegHalf.residues[k];
+	  ak = MDPowerNegHalf.residues[k];
 	  DenOp.M(MfMpvPhi_k[k],Y);
 	  DenOp.MDeriv(tmp , MfMpvPhi_k[k], Y,DaggerYes );  dSdU=dSdU+ak*tmp;
 	  DenOp.MDeriv(tmp , Y, MfMpvPhi_k[k], DaggerNo );  dSdU=dSdU+ak*tmp;
@@ -254,7 +263,7 @@ NAMESPACE_BEGIN(Grid);
 	//(3)
 	for(int k=0;k<n_pv;k++){
-          ak = PowerQuarter.residues[k];
+          ak = MDPowerQuarter.residues[k];
 	  NumOp.M(MpvPhi_k[k],Y);
 	  NumOp.MDeriv(tmp,MpvMfMpvPhi_k[k],Y,DaggerYes); dSdU=dSdU+ak*tmp;
@@ -40,6 +40,8 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
@@ -38,7 +38,7 @@ NAMESPACE_BEGIN(Grid);
    class TwoFlavourEvenOddRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
-
+      
    private:
      FermionOperator<Impl> & NumOp;// the basic operator
      FermionOperator<Impl> & DenOp;// the basic operator
@@ -50,6 +50,8 @@ NAMESPACE_BEGIN(Grid);
      FermionField PhiOdd;   // the pseudo fermion field for this trajectory
      FermionField PhiEven;  // the pseudo fermion field for this trajectory
      RealD RefreshAction;
    public:
      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
                                                FermionOperator<Impl>  &_DenOp, 
@@ -75,24 +77,22 @@ NAMESPACE_BEGIN(Grid);
          conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
        };
-      virtual std::string action_name(){return "TwoFlavourEvenOddRatioPseudoFermionAction";}
+      virtual std::string action_name(){
 	std::stringstream sstream;
 	sstream<<"TwoFlavourEvenOddRatioPseudoFermionAction det("<<DenOp.Mass()<<") / det("<<NumOp.Mass()<<")";
 	return sstream.str();
      }
      virtual std::string LogParameters(){
 	std::stringstream sstream;
-	sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
+	sstream<< GridLogMessage << "["<<action_name()<<"] -- No further parameters "<<std::endl;
 	return sstream.str();
      } 
-      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+      const FermionField &getPhiOdd() const{ return PhiOdd; }
-        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        //
        // NumOp == V
        // DenOp == M
        //
        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        //
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
@@ -100,39 +100,59 @@ NAMESPACE_BEGIN(Grid);
        RealD scale = std::sqrt(0.5);
        FermionField eta    (NumOp.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refresh(U,eta);
      }
      void refresh(const GaugeField &U, const FermionField &eta) {
        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
        // NumOp == V
        // DenOp == M
        //
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());
        gaussian(pRNG,eta);
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);
        NumOp.ImportGauge(U);
        DenOp.ImportGauge(U);
 	std::cout << " TwoFlavourRefresh:  Imported gauge "<<std::endl;
        SchurDifferentiableOperator<Impl> Mpc(DenOp);
        SchurDifferentiableOperator<Impl> Vpc(NumOp);
 	std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl;
        // Odd det factors
        Mpc.MpcDag(etaOdd,PhiOdd);
 	std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl;
        tmp=Zero();
 	std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl;
        HeatbathSolver(Vpc,PhiOdd,tmp);
 	std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl;
        Vpc.Mpc(tmp,PhiOdd);            
 	std::cout << " TwoFlavourRefresh: Mpc "<<std::endl;
        // Even det factors
        DenOp.MooeeDag(etaEven,tmp);
        NumOp.MooeeInvDag(tmp,PhiEven);
 	std::cout << " TwoFlavourRefresh: Mee "<<std::endl;
-        PhiOdd =PhiOdd*scale;
+	RefreshAction = norm2(etaEven)+norm2(etaOdd);
-        PhiEven=PhiEven*scale;
+	std::cout << " refresh " <<action_name()<< " action "<<RefreshAction<<std::endl;
      };
      //////////////////////////////////////////////////////
      // S = phi^dag V (Mdag M)^-1 Vdag phi
      //////////////////////////////////////////////////////
      virtual RealD Sinitial(const GaugeField &U) {
 	std::cout << GridLogMessage << "Returning stored two flavour refresh action "<<RefreshAction<<std::endl;
 	return RefreshAction;
      }
      virtual RealD S(const GaugeField &U) {
        NumOp.ImportGauge(U);
@@ -0,0 +1,203 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class Impl>
 class TwoFlavourRatioEO4DPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);
 private:
  typedef FermionOperator<Impl> FermOp;
  FermionOperator<Impl> & NumOp;// the basic operator
  FermionOperator<Impl> & DenOp;// the basic operator
  OperatorFunction<FermionField> &DerivativeSolver;
  OperatorFunction<FermionField> &DerivativeDagSolver;
  OperatorFunction<FermionField> &ActionSolver;
  OperatorFunction<FermionField> &HeatbathSolver;
  FermionField phi4; // the pseudo fermion field for this trajectory
 public:
  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					 FermionOperator<Impl>  &_DenOp, 
 					 OperatorFunction<FermionField> & DS,
 					 OperatorFunction<FermionField> & AS ) : 
    TwoFlavourRatioEO4DPseudoFermionAction(_NumOp,_DenOp, DS,DS,AS,AS) {};
  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					 FermionOperator<Impl>  &_DenOp, 
 					 OperatorFunction<FermionField> & DS,
 					 OperatorFunction<FermionField> & DDS,
 					 OperatorFunction<FermionField> & AS,
 					 OperatorFunction<FermionField> & HS
 				       ) : NumOp(_NumOp),
 					   DenOp(_DenOp),
 					   DerivativeSolver(DS),
 					   DerivativeDagSolver(DDS),
 					   ActionSolver(AS),
 					   HeatbathSolver(HS),
 					   phi4(_NumOp.GaugeGrid())
  {};
  virtual std::string action_name(){return "TwoFlavourRatioEO4DPseudoFermionAction";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
    // P(phi) = e^{- phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi}
    //
    // NumOp == V
    // DenOp == M
    //
    // Take phi = (V^{-1} M)_11 eta  ; eta = (M^{-1} V)_11 Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    FermionField eta4(NumOp.GaugeGrid());
    FermionField eta5(NumOp.FermionGrid());
    FermionField tmp(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());
    gaussian(pRNG,eta4);
    NumOp.ImportFourDimPseudoFermion(eta4,eta5);
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(HeatbathSolver);
    DenOp.M(eta5,tmp);               // M eta
    PrecSolve(NumOp,tmp,phi5);  // phi = V^-1 M eta
    phi5=phi5*scale;
    std::cout << GridLogMessage << "4d pf refresh "<< norm2(phi5)<<"\n";
    // Project to 4d
    NumOp.ExportFourDimPseudoFermion(phi5,phi4);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField Y4(NumOp.GaugeGrid());
    FermionField X(NumOp.FermionGrid());
    FermionField Y(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(ActionSolver);
    NumOp.ImportFourDimPseudoFermion(phi4,phi5);
    NumOp.M(phi5,X);              // X= V phi
    PrecSolve(DenOp,X,Y);    // Y= (MdagM)^-1 Mdag Vdag phi = M^-1 V phi
    NumOp.ExportFourDimPseudoFermion(Y,Y4);
    RealD action = norm2(Y4);
    return action;
  };
  //////////////////////////////////////////////////////
  // dS/du = 2 Re phi^dag (V^dag M^-dag)_11  (M^-1 d V)_11  phi
  //       - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
  //////////////////////////////////////////////////////
  virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField  X(NumOp.FermionGrid());
    FermionField  Y(NumOp.FermionGrid());
    FermionField       phi(NumOp.FermionGrid());
    FermionField      Vphi(NumOp.FermionGrid());
    FermionField  MinvVphi(NumOp.FermionGrid());
    FermionField      tmp4(NumOp.GaugeGrid());
    FermionField  MdagInvMinvVphi(NumOp.FermionGrid());
    GaugeField   force(NumOp.GaugeGrid());	
    //Y=V phi
    //X = (Mdag V phi
    //Y = (Mdag M)^-1 Mdag V phi = M^-1 V Phi
    NumOp.ImportFourDimPseudoFermion(phi4,phi);
    NumOp.M(phi,Vphi);               //  V phi
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(DerivativeSolver);
    PrecSolve(DenOp,Vphi,MinvVphi);// M^-1 V phi
    std::cout << GridLogMessage << "4d deriv solve "<< norm2(MinvVphi)<<"\n";
    // Projects onto the physical space and back
    NumOp.ExportFourDimPseudoFermion(MinvVphi,tmp4);
    NumOp.ImportFourDimPseudoFermion(tmp4,Y);
    SchurRedBlackDiagMooeeDagSolve<FermionField> PrecDagSolve(DerivativeDagSolver);
    // X = proj M^-dag V phi
    // Need an adjoint solve
    PrecDagSolve(DenOp,Y,MdagInvMinvVphi);
    std::cout << GridLogMessage << "4d deriv solve dag "<< norm2(MdagInvMinvVphi)<<"\n";
    // phi^dag (Vdag Mdag^-1) (M^-1 dV)  phi
    NumOp.MDeriv(force ,MdagInvMinvVphi , phi, DaggerNo );  dSdU=force;
    // phi^dag (dVdag Mdag^-1) (M^-1 V)  phi
    NumOp.MDeriv(force , phi, MdagInvMinvVphi ,DaggerYes  );  dSdU=dSdU+force;
    //    - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
    DenOp.MDeriv(force,MdagInvMinvVphi,MinvVphi,DaggerNo);   dSdU=dSdU-force;
    DenOp.MDeriv(force,MinvVphi,MdagInvMinvVphi,DaggerYes);  dSdU=dSdU-force;
    dSdU *= -1.0; 
    //dSdU = - Ta(dSdU);
  };
 };
 NAMESPACE_END(Grid);
@@ -47,7 +47,7 @@ private:
  const unsigned int N = Impl::Group::Dimension;
  typedef typename Field::vector_object vobj;
-  typedef CartesianStencil<vobj, vobj,int> Stencil;
+  typedef CartesianStencil<vobj, vobj,DefaultImplParams> Stencil;
  SimpleCompressor<vobj> compressor;
  int npoint = 2 * Ndim;
@@ -82,7 +82,7 @@ public:
  virtual RealD S(const Field &p)
  {
    assert(p.Grid()->Nd() == Ndim);
-    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements,0);
+    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);
    phiStencil.HaloExchange(p, compressor);
    Field action(p.Grid()), pshift(p.Grid()), phisquared(p.Grid());
    phisquared = p * p;
@@ -133,7 +133,7 @@ public:
    double interm_t = usecond();
    // move this outside
-    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements,0);
+    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);
    phiStencil.HaloExchange(p, compressor);
    double halo_t = usecond();
@@ -0,0 +1,6 @@
 #ifndef GRID_GPARITY_H_
 #define GRID_GPARITY_H_
 #include<Grid/qcd/gparity/GparityFlavour.h>
 #endif
@@ -0,0 +1,34 @@
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{
    GparityFlavour(GparityFlavour::Algebra::SigmaX),
    GparityFlavour(GparityFlavour::Algebra::SigmaY),
    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
    }};
 const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{
  GparityFlavour(GparityFlavour::Algebra::Identity),
  GparityFlavour(GparityFlavour::Algebra::SigmaX),
  GparityFlavour(GparityFlavour::Algebra::SigmaY),
  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
 }};
 const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{
    "SigmaX",
    "MinusSigmaX",
    "SigmaY",
    "MinusSigmaY",
    "SigmaZ",
    "MinusSigmaZ",
    "Identity",
    "MinusIdentity",
    "ProjPlus",
    "MinusProjPlus",
    "ProjMinus",
    "MinusProjMinus"}};
 NAMESPACE_END(Grid);
@@ -0,0 +1,475 @@
 #ifndef GRID_QCD_GPARITY_FLAVOUR_H
 #define GRID_QCD_GPARITY_FLAVOUR_H
 //Support for flavour-matrix operations acting on the G-parity flavour index
 #include <array>
 NAMESPACE_BEGIN(Grid);
 class GparityFlavour {
  public:
    GRID_SERIALIZABLE_ENUM(Algebra, undef,
                           SigmaX, 0,
 			   MinusSigmaX, 1,
                           SigmaY, 2,
 			   MinusSigmaY, 3,
                           SigmaZ, 4,
 			   MinusSigmaZ, 5,
 			   Identity, 6,
 			   MinusIdentity, 7,
 			   ProjPlus, 8,
 			   MinusProjPlus, 9,
 			   ProjMinus, 10,
 			   MinusProjMinus, 11
 			   );
    static constexpr unsigned int nSigma = 12;
    static const std::array<const char *, nSigma>                name;
    static const std::array<const GparityFlavour, 3>             sigma_mu;
    static const std::array<const GparityFlavour, 6>            sigma_all;
    Algebra                                                      g;
  public:
  accelerator GparityFlavour(Algebra initg): g(initg) {}  
 };
 // 0 1  x   vector
 // 1 0
 template<class vtype>
 accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(1);
  ret(1) = rhs(0);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(1,0);
  ret(0,1) = rhs(1,1);
  ret(1,0) = rhs(0,0);
  ret(1,1) = rhs(0,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,1);
  ret(0,1) = rhs(0,0);
  ret(1,0) = rhs(1,1);
  ret(1,1) = rhs(1,0);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(1);
  ret(1) = -rhs(0);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(1,0);
  ret(0,1) = -rhs(1,1);
  ret(1,0) = -rhs(0,0);
  ret(1,1) = -rhs(0,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,1);
  ret(0,1) = -rhs(0,0);
  ret(1,0) = -rhs(1,1);
  ret(1,1) = -rhs(1,0);
 };
 // 0 -i  x   vector
 // i 0
 template<class vtype>
 accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = timesMinusI(rhs(1));
  ret(1) = timesI(rhs(0));
 };
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesMinusI(rhs(1,0));
  ret(0,1) = timesMinusI(rhs(1,1));
  ret(1,0) = timesI(rhs(0,0));
  ret(1,1) = timesI(rhs(0,1));
 };
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesI(rhs(0,1));
  ret(0,1) = timesMinusI(rhs(0,0));
  ret(1,0) = timesI(rhs(1,1));
  ret(1,1) = timesMinusI(rhs(1,0));
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = timesI(rhs(1));
  ret(1) = timesMinusI(rhs(0));
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesI(rhs(1,0));
  ret(0,1) = timesI(rhs(1,1));
  ret(1,0) = timesMinusI(rhs(0,0));
  ret(1,1) = timesMinusI(rhs(0,1));
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesMinusI(rhs(0,1));
  ret(0,1) = timesI(rhs(0,0));
  ret(1,0) = timesMinusI(rhs(1,1));
  ret(1,1) = timesI(rhs(1,0));
 };
 // 1 0  x   vector
 // 0 -1
 template<class vtype>
 accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(0);
  ret(1) = -rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(0);
  ret(1) = rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(0);
  ret(1) = rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(0);
  ret(1) = -rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 //G-parity flavour projection 1/2(1+\sigma_2)
 //1 -i
 //i  1
 template<class vtype>
 accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
  ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
  ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1);
  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
  ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
  ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1);
  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
  ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1);
 };
 //G-parity flavour projection 1/2(1-\sigma_2)
 //1 i
 //-i  1
 template<class vtype>
 accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
  ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
  ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1);
  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
  ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
  ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
  ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1);
  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1);
 };
 template<class vtype> 
 accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
 {
  iVector<vtype, Ngp> ret;
  switch (G.g) 
  {
  case GparityFlavour::Algebra::SigmaX:
    multFlavourSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaX:
    multFlavourMinusSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::SigmaY:
    multFlavourSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaY:
    multFlavourMinusSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::SigmaZ:
    multFlavourSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaZ:
    multFlavourMinusSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::Identity:
    multFlavourIdentity(ret, arg); break;
  case GparityFlavour::Algebra::MinusIdentity:
    multFlavourMinusIdentity(ret, arg); break;
  case GparityFlavour::Algebra::ProjPlus:
    multFlavourProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjPlus:
    multFlavourMinusProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::ProjMinus:
    multFlavourProjMinus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjMinus:
    multFlavourMinusProjMinus(ret, arg); break;
  default: assert(0);
  }
  return ret;
 }
 template<class vtype> 
 accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
  iMatrix<vtype, Ngp> ret;
  switch (G.g) 
  {
  case GparityFlavour::Algebra::SigmaX:
    lmultFlavourSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaX:
    lmultFlavourMinusSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::SigmaY:
    lmultFlavourSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaY:
    lmultFlavourMinusSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::SigmaZ:
    lmultFlavourSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaZ:
    lmultFlavourMinusSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::Identity:
    lmultFlavourIdentity(ret, arg); break;
  case GparityFlavour::Algebra::MinusIdentity:
    lmultFlavourMinusIdentity(ret, arg); break;
  case GparityFlavour::Algebra::ProjPlus:
    lmultFlavourProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjPlus:
    lmultFlavourMinusProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::ProjMinus:
    lmultFlavourProjMinus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjMinus:
    lmultFlavourMinusProjMinus(ret, arg); break;  
  default: assert(0);
  }
  return ret;
 }
 template<class vtype> 
 accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
  iMatrix<vtype, Ngp> ret;
  switch (G.g) 
  {
  case GparityFlavour::Algebra::SigmaX:
    rmultFlavourSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaX:
    rmultFlavourMinusSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::SigmaY:
    rmultFlavourSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaY:
    rmultFlavourMinusSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::SigmaZ:
    rmultFlavourSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaZ:
    rmultFlavourMinusSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::Identity:
    rmultFlavourIdentity(ret, arg); break;
  case GparityFlavour::Algebra::MinusIdentity:
    rmultFlavourMinusIdentity(ret, arg); break;
  case GparityFlavour::Algebra::ProjPlus:
    rmultFlavourProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjPlus:
    rmultFlavourMinusProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::ProjMinus:
    rmultFlavourProjMinus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjMinus:
    rmultFlavourMinusProjMinus(ret, arg); break;
  default: assert(0);
  }
  return ret;
 }
 NAMESPACE_END(Grid);
 #endif // include guard
@@ -129,18 +129,10 @@ public:
    Runner(S);
  }
-  //////////////////////////////////////////////////////////////////
+  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
-
+  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
-private:
+  void initializeGaugeFieldAndRNGs(Field &U){
-  template <class SmearingPolicy>
+    if(!Resources.haveRNGs()) Resources.AddRNGs();
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Resources.AddRNGs();
    Field U(UGrid);
    // Can move this outside?
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    if (Parameters.StartingType == "HotStart") {
      // Hot start
@@ -159,14 +151,43 @@ private:
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
    } else if (Parameters.StartingType == "CheckpointStartReseed") {
      // Same as CheckpointRestart but reseed the RNGs using the fixed integer seeding used for ColdStart and HotStart
      // Useful for creating new evolution streams from an existing stream
      // WARNING: Unfortunately because the checkpointer doesn't presently allow us to separately restore the RNG and gauge fields we have to load
      // an existing RNG checkpoint first; make sure one is available and named correctly
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
      Resources.SeedFixedIntegers();      
    } else {
      // others
      std::cout << GridLogError << "Unrecognized StartingType\n";
      std::cout
 	<< GridLogError
-	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart, CheckpointStartReseed]\n";
      exit(1);
    }
  }
  //////////////////////////////////////////////////////////////////
 private:
  template <class SmearingPolicy>
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Field U(UGrid);
    initializeGaugeFieldAndRNGs(U);
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    // Sets the momentum filter
    MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter()));
    Smearing.set_Field(U);
@@ -34,6 +34,7 @@ directory
 			    * @brief Classes for Hybrid Monte Carlo update
 			    *
 			    * @author Guido Cossu
 			    * @author Peter Boyle
 			    */
 			   //--------------------------------------------------------------------
 #pragma once
@@ -52,6 +53,7 @@ struct HMCparameters: Serializable {
                                  Integer, Trajectories, /* @brief Number of sweeps in this run */
                                  bool, MetropolisTest,
                                  Integer, NoMetropolisUntil,
 				  bool, PerformRandomShift, /* @brief Randomly shift the gauge configuration at the start of a trajectory */
                                  std::string, StartingType,
                                  IntegratorParameters, MD)
@@ -62,6 +64,7 @@ struct HMCparameters: Serializable {
    StartTrajectory   = 0;
    Trajectories      = 10;
    StartingType      = "HotStart";
    PerformRandomShift = true;
    /////////////////////////////////
  }
@@ -82,6 +85,7 @@ struct HMCparameters: Serializable {
    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
    std::cout << GridLogMessage << "[HMC parameters] Doing random shift      : " << std::boolalpha << PerformRandomShift << "\n";
    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
    MD.print_parameters();
  }
@@ -94,6 +98,7 @@ private:
  const HMCparameters Params;
  typedef typename IntegratorType::Field Field;
  typedef typename IntegratorType::FieldImplementation FieldImplementation;
  typedef std::vector< HmcObservable<Field> * > ObsListType;
  //pass these from the resource manager
@@ -115,22 +120,17 @@ private:
    random(sRNG, rn_test);
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC << "--------------------------------------------------\n";
-              << "--------------------------------------------------\n";
+    std::cout << GridLogHMC << "exp(-dH) = " << prob << "  Random = " << rn_test << "\n";
-    std::cout << GridLogMessage << "exp(-dH) = " << prob
+    std::cout << GridLogHMC << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
              << "  Random = " << rn_test << "\n";
    std::cout << GridLogMessage
              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "--------------------------------------------------\n";
                << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
-      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "--------------------------------------------------\n";
                << "--------------------------------------------------\n";
      return false;
    }
  }
@@ -139,19 +139,80 @@ private:
  // Evolution
  /////////////////////////////////////////////////////////
  RealD evolve_hmc_step(Field &U) {
    TheIntegrator.refresh(U, sRNG, pRNG);  // set U and initialize P and phi's
-    RealD H0 = TheIntegrator.S(U);  // initial state action
+    GridBase *Grid = U.Grid();
    if(Params.PerformRandomShift){
 #if 0
      //////////////////////////////////////////////////////////////////////////////////////////////////////
      // Mainly for DDHMC perform a random translation of U modulo volume
      //////////////////////////////////////////////////////////////////////////////////////////////////////
      std::cout << GridLogMessage << "--------------------------------------------------\n";
      std::cout << GridLogMessage << "Random shifting gauge field by [";
      std::vector<typename FieldImplementation::GaugeLinkField> Umu(Grid->Nd(), U.Grid());
      for(int mu=0;mu<Grid->Nd();mu++) Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
      for(int d=0;d<Grid->Nd();d++) {
 	int L = Grid->GlobalDimensions()[d];
 	RealD rn_uniform;  random(sRNG, rn_uniform);
 	int shift = (int) (rn_uniform*L);
 	std::cout << shift;
 	if(d<Grid->Nd()-1) std::cout <<",";
 	else               std::cout <<"]\n";
 	//shift all fields together in a way that respects the gauge BCs
 	for(int mu=0; mu < Grid->Nd(); mu++)
 	  Umu[mu] = FieldImplementation::CshiftLink(Umu[mu],d,shift);
 	for(int mu=0;mu<Grid->Nd();mu++) PokeIndex<LorentzIndex>(U,Umu[mu],mu);
      }
      std::cout << GridLogMessage << "--------------------------------------------------\n";
 #endif	
    }
    TheIntegrator.reset_timer();
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // set U and initialize P and phi's
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Refresh momenta and pseudofermions";
    TheIntegrator.refresh(U, sRNG, pRNG);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // initial state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Compute initial action";
    RealD H0 = TheIntegrator.Sinitial(U);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << " Molecular Dynamics evolution ";
    TheIntegrator.integrate(U);
    std::cout << GridLogMessage << "--------------------------------------------------\n";
-    RealD H1 = TheIntegrator.S(U);  // updated state action
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // updated state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Compute final action";
    RealD H1 = TheIntegrator.S(U);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    ///////////////////////////////////////////////////////////
    if(0){
      std::cout << "------------------------- Reversibility test" << std::endl;
@@ -163,17 +224,16 @@ private:
    }
    ///////////////////////////////////////////////////////////
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+
-	      << "  dH = " << H1 - H0 << "\n";
+    std::cout << GridLogHMC << "--------------------------------------------------\n";
    std::cout << GridLogHMC << "Total H after trajectory  = " << H1 << "  dH = " << H1 - H0 << "\n";
    std::cout << GridLogHMC << "--------------------------------------------------\n";
    std::cout.precision(current_precision);
    return (H1 - H0);
  }
 public:
  /////////////////////////////////////////
@@ -195,10 +255,13 @@ public:
    // Actual updates (evolve a copy Ucopy then copy back eventually)
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+
      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
+      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
      double t0=usecond();
@@ -207,20 +270,19 @@ public:
      DeltaH = evolve_hmc_step(Ucopy);
      // Metropolis-Hastings test
      bool accept = true;
-      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      } else {
-      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
      }
      if (accept)
        Ucur = Ucopy; 
      double t1=usecond();
-      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
      TheIntegrator.print_timer();
      for (int obs = 0; obs < Observables.size(); obs++) {
      	std::cout << GridLogDebug << "Observables # " << obs << std::endl;
@@ -228,7 +290,7 @@ public:
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
  }
@@ -80,7 +80,9 @@ public:
      std::cout << GridLogError << "Seeds not initialized" << std::endl;
      exit(1);
    }
    std::cout << GridLogMessage << "Reseeding serial RNG with seed vector " << SerialSeeds << std::endl;
    sRNG_.SeedFixedIntegers(SerialSeeds);
    std::cout << GridLogMessage << "Reseeding parallel RNG with seed vector " << ParallelSeeds << std::endl;
    pRNG_->SeedFixedIntegers(ParallelSeeds);
  }
 };
@@ -72,6 +72,8 @@ class HMCResourceManager {
  typedef HMCModuleBase< BaseHmcCheckpointer<ImplementationPolicy> > CheckpointerBaseModule;
  typedef HMCModuleBase< HmcObservable<typename ImplementationPolicy::Field> > ObservableBaseModule;
  typedef ActionModuleBase< Action<typename ImplementationPolicy::Field>, GridModule > ActionBaseModule;
  typedef typename ImplementationPolicy::Field MomentaField;
  typedef typename ImplementationPolicy::Field Field;  
  // Named storage for grid pairs (std + red-black)
  std::unordered_map<std::string, GridModule> Grids;
@@ -80,6 +82,9 @@ class HMCResourceManager {
  // SmearingModule<ImplementationPolicy> Smearing;
  std::unique_ptr<CheckpointerBaseModule> CP;
  // Momentum filter
  std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> > Filter;
  // A vector of HmcObservable modules
  std::vector<std::unique_ptr<ObservableBaseModule> > ObservablesList;
@@ -90,6 +95,7 @@ class HMCResourceManager {
  bool have_RNG;
  bool have_CheckPointer;
  bool have_Filter;
  // NOTE: operator << is not overloaded for std::vector<string> 
  // so this function is necessary
@@ -101,7 +107,7 @@ class HMCResourceManager {
 public:
-  HMCResourceManager() : have_RNG(false), have_CheckPointer(false) {}
+  HMCResourceManager() : have_RNG(false), have_CheckPointer(false), have_Filter(false) {}
  template <class ReaderClass, class vector_type = vComplex >
  void initialize(ReaderClass &Read){
@@ -129,6 +135,7 @@ public:
    RNGModuleParameters RNGpar(Read);
    SetRNGSeeds(RNGpar);
    // Observables
    auto &ObsFactory = HMC_ObservablesModuleFactory<observable_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance(); 
    Read.push(observable_string);// here must check if existing...
@@ -208,6 +215,16 @@ public:
    AddGrid(s, Mod);
  }
  void SetMomentumFilter( MomentumFilterBase<typename ImplementationPolicy::Field> * MomFilter) {
    assert(have_Filter==false);
    Filter = std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> >(MomFilter);
    have_Filter = true;
  }
  MomentumFilterBase<typename ImplementationPolicy::Field> *GetMomentumFilter(void) {
    if ( !have_Filter)
      SetMomentumFilter(new MomentumFilterNone<typename ImplementationPolicy::Field>());
    return Filter.get();
  }
  GridCartesian* GetCartesian(std::string s = "") {
    if (s.empty()) s = Grids.begin()->first;
@@ -226,6 +243,9 @@ public:
  //////////////////////////////////////////////////////
  // Random number generators
  //////////////////////////////////////////////////////
  //Return true if the RNG objects have been instantiated
  bool haveRNGs() const{ return have_RNG; }
  void AddRNGs(std::string s = "") {
    // Couple the RNGs to the GridModule tagged by s
@@ -33,7 +33,6 @@ directory
 #define INTEGRATOR_INCLUDED
 #include <memory>
 #include "MomentumFilter.h"
 NAMESPACE_BEGIN(Grid);
@@ -64,9 +63,10 @@ public:
 };
 /*! @brief Class for Molecular Dynamics management */
-template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy>
+template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy>
 class Integrator {
 protected:
  typedef FieldImplementation_ FieldImplementation;
  typedef typename FieldImplementation::Field MomentaField;  //for readability
  typedef typename FieldImplementation::Field Field;
@@ -119,36 +119,65 @@ protected:
    }
  } update_P_hireps{};
  void update_P(MomentaField& Mom, Field& U, int level, double ep) {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing
    for (int a = 0; a < as[level].actions.size(); ++a) {
      double start_full = usecond();
      Field force(U.Grid());
      conformable(U.Grid(), Mom.Grid());
      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();
      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
      as[level].actions.at(a)->deriv_timer_start();
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
      as[level].actions.at(a)->deriv_timer_stop();
      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      auto name = as[level].actions.at(a)->action_name();
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
+
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+      //      DumpSliceNorm("force ",force,Nd-1);
      MomFilter->applyFilter(force);
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl;
      DumpSliceNorm("force filtered ",force,Nd-1);
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      Real force_max   = std::sqrt(maxLocalNorm2(force));
      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
      as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] dt           : " << ep <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average  : " << impulse_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt max      : " << impulse_max <<" "<<name<<std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
      double time_force = (end_force - start_force) / 1e3;
      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
    }
    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
    MomFilter->applyFilter(Mom);
  }
  void update_U(Field& U, double ep) 
@@ -162,8 +191,12 @@ protected:
  void update_U(MomentaField& Mom, Field& U, double ep) 
  {
    MomentaField MomFiltered(Mom.Grid());
    MomFiltered = Mom;
    MomFilter->applyFilter(MomFiltered);
    // exponential of Mom*U in the gauge fields case
-    FieldImplementation::update_field(Mom, U, ep);
+    FieldImplementation::update_field(MomFiltered, U, ep);
    // Update the smeared fields, can be implemented as observer
    Smearer.set_Field(U);
@@ -206,6 +239,77 @@ public:
  const MomentaField & getMomentum() const{ return P; }
  void reset_timer(void)
  {
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        as[level].actions.at(actionID)->reset_timer();
      }
    }
  }
  void print_timer(void)
  {
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::" << std::endl;
    std::cout << GridLogMessage << " Refresh cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->refresh_us*1.0e-6<<" s"<< std::endl;
      }
    }
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Action cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->S_us*1.0e-6<<" s"<< std::endl;
      }
    }
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Force cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
      }
    }
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Dslash counts "<<std::endl;
    std::cout << GridLogMessage << "------------------------- "<<std::endl;
    uint64_t full, partial, dirichlet;
    DslashGetCounts(dirichlet,partial,full);
    std::cout << GridLogMessage << " Full BCs               : "<<full<<std::endl;
    std::cout << GridLogMessage << " Partial dirichlet BCs  : "<<partial<<std::endl;
    std::cout << GridLogMessage << " Dirichlet BCs          : "<<dirichlet<<std::endl;
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Force average size "<<std::endl;
    std::cout << GridLogMessage << "------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] :\n\t\t "
 		  <<" force max " << as[level].actions.at(actionID)->deriv_max_average()
 		  <<" norm "      << as[level].actions.at(actionID)->deriv_norm_average()
 		  <<" Fdt max  "  << as[level].actions.at(actionID)->Fdt_max_average()
 		  <<" Fdt norm "  << as[level].actions.at(actionID)->Fdt_norm_average()
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
  void print_parameters()
  {
    std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl;
@@ -224,7 +328,6 @@ public:
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
  void reverse_momenta()
@@ -249,15 +352,19 @@ public:
  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
    assert(P.Grid() == U.Grid());
-    std::cout << GridLogIntegrator << "Integrator refresh\n";
+    std::cout << GridLogIntegrator << "Integrator refresh" << std::endl;
    std::cout << GridLogIntegrator << "Generating momentum" << std::endl;
    FieldImplementation::generate_momenta(P, sRNG, pRNG);
    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
    // of the Metropolis
    std::cout << GridLogIntegrator << "Updating smeared fields" << std::endl;
    Smearer.set_Field(U);
    // Set the (eventual) representations gauge fields
    std::cout << GridLogIntegrator << "Updating representations" << std::endl;
    Representations.update(U);
    // The Smearer is attached to a pointer of the gauge field
@@ -267,15 +374,24 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
 	auto name = as[level].actions.at(actionID)->action_name();
        std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
 	as[level].actions.at(actionID)->refresh_timer_start();
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
 	as[level].actions.at(actionID)->refresh_timer_stop();
 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
      }
      // Refresh the higher representation actions
      as[level].apply(refresh_hireps, Representations, sRNG, pRNG);
    }
    MomFilter->applyFilter(P);
  }
  // to be used by the actionlevel class to iterate
@@ -306,13 +422,17 @@ public:
    // Actions
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 	        as[level].actions.at(actionID)->S_timer_start();
        Hterm = as[level].actions.at(actionID)->S(Us);
   	        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
      }
      as[level].apply(S_hireps, Representations, level, H);
    }
@@ -320,6 +440,52 @@ public:
    return H;
  }
  struct _Sinitial {
    template <class FieldType, class Repr>
    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
      for (int a = 0; a < repr_set.size(); ++a) {
        RealD Hterm = repr_set.at(a)->Sinitial(Rep.U);
        std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
        H += Hterm;
      }
    }
  } Sinitial_hireps{};
  RealD Sinitial(Field& U) 
  {  // here also U not used
    std::cout << GridLogIntegrator << "Integrator initial action\n";
    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
    RealD Hterm;
    // Actions
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 	        as[level].actions.at(actionID)->S_timer_start();
        Hterm = as[level].actions.at(actionID)->Sinitial(Us);
   	        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
      }
      as[level].apply(Sinitial_hireps, Representations, level, H);
    }
    return H;
  }
  void integrate(Field& U) 
  {
    // reset the clocks
@@ -92,10 +92,11 @@ NAMESPACE_BEGIN(Grid);
 *  P 1/2                            P 1/2
 */
-template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
+template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
-class LeapFrog : public Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy> 
+class LeapFrog : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy> 
 {
 public:
  typedef FieldImplementation_ FieldImplementation;
  typedef LeapFrog<FieldImplementation, SmearingPolicy, RepresentationPolicy> Algorithm;
  INHERIT_FIELD_TYPES(FieldImplementation);
@@ -135,13 +136,14 @@ public:
  }
 };
-template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
+template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
-class MinimumNorm2 : public Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy> 
+class MinimumNorm2 : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy> 
 {
 private:
  const RealD lambda = 0.1931833275037836;
 public:
  typedef FieldImplementation_ FieldImplementation;
  INHERIT_FIELD_TYPES(FieldImplementation);
  MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm)
@@ -192,8 +194,8 @@ public:
  }
 };
-template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
+template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
-class ForceGradient : public Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy> 
+class ForceGradient : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy> 
 {
 private:
  const RealD lambda = 1.0 / 6.0;
@@ -202,6 +204,7 @@ private:
  const RealD theta = 0.0;
 public:
  typedef FieldImplementation_ FieldImplementation;
  INHERIT_FIELD_TYPES(FieldImplementation);
  // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
@@ -227,7 +230,8 @@ public:
    // Presently 4 force evals, and should have 3, so 1.33x too expensive.
    // could reduce this with sloppy CG to perhaps 1.15x too expensive
    // even without prediction.
-    this->update_P(Pfg, Ufg, level, 1.0);
+    this->update_P(Pfg, Ufg, level, fg_dt);
    Pfg = Pfg*(1.0/fg_dt);
    this->update_U(Pfg, Ufg, fg_dt);
    this->update_P(Ufg, level, ep);
  }
@@ -78,13 +78,13 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
 // Now a specific registration with a fermion field
 // here must instantiate CG and CR for every new fermion field type (macro!!)
-static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,   
+static Registrar< ConjugateGradientModule<WilsonFermionD::FermionField>,   
-                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient"); 
+                  HMC_SolverModuleFactory<solver_string, WilsonFermionD::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient"); 
-static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,   
+static Registrar< BiCGSTABModule<WilsonFermionD::FermionField>,   
-                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB"); 
+                  HMC_SolverModuleFactory<solver_string, WilsonFermionD::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB"); 
-static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,   
+static Registrar< ConjugateResidualModule<WilsonFermionD::FermionField>,   
-                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); 
+                  HMC_SolverModuleFactory<solver_string, WilsonFermionD::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); 
 // add the staggered, scalar versions here
@@ -31,15 +31,16 @@ directory
 NAMESPACE_BEGIN(Grid);
 struct TopologySmearingParameters : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
 				  int, steps,
 				  float, step_size,
 				  int, meas_interval,
-				  float, maxTau);
+				  float, init_step_size,
 				  float, maxTau,
 				  float, tolerance);
-  TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
+ TopologySmearingParameters(float ss = 0.0f, int mi = 0, float mT = 0.0f, float tol = 1e-4):
-    steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
+  init_step_size(ss), meas_interval(mi), maxTau(mT), tolerance(tol){}
  template < class ReaderClass >
  TopologySmearingParameters(Reader<ReaderClass>& Reader){
@@ -97,9 +98,9 @@ public:
      if (Pars.do_smearing){
 	// using wilson flow by default here
-	WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
+	WilsonFlowAdaptive<PeriodicGimplR> WF(Pars.Smearing.init_step_size, Pars.Smearing.maxTau, Pars.Smearing.tolerance, Pars.Smearing.meas_interval);
-	WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
+	WF.smear(Usmear, U);
-	Real T0   = WF.energyDensityPlaquette(Usmear);
+	Real T0   = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear);
 	std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
 		  << "T0                : [ " << traj << " ] "<< T0 << std::endl;
      }
@@ -7,6 +7,7 @@ Source file: ./lib/qcd/modules/plaquette.h
 Copyright (C) 2017
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -32,177 +33,318 @@ directory
 NAMESPACE_BEGIN(Grid);
 template <class Gimpl>
-class WilsonFlow: public Smear<Gimpl>{
+class WilsonFlowBase: public Smear<Gimpl>{
-  unsigned int Nstep;
+public:
-  unsigned int measure_interval;
+  //Store generic measurements to take during smearing process using std::function
-  mutable RealD epsilon, taus;
+  typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType;  //int: step,  RealD: flow time,  GaugeField : the gauge field
 protected:
  std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency
  mutable WilsonGaugeAction<Gimpl> SG;
-
+   
  void evolve_step(typename Gimpl::GaugeField&) const;
  void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
  RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
 public:
  INHERIT_GIMPL_TYPES(Gimpl)
-  explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
+  explicit WilsonFlowBase(unsigned int meas_interval =1):
  Nstep(Nstep),
    epsilon(epsilon),
    measure_interval(interval),
    SG(WilsonGaugeAction<Gimpl>(3.0)) {
    // WilsonGaugeAction with beta 3.0
-    assert(epsilon > 0.0);
+    setDefaultMeasurements(meas_interval);
    LogMessage();
  }
  void resetActions(){ functions.clear(); }
-  void LogMessage() {
+  void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); }
    std::cout << GridLogMessage
 	      << "[WilsonFlow] Nstep   : " << Nstep << std::endl;
    std::cout << GridLogMessage
 	      << "[WilsonFlow] epsilon : " << epsilon << std::endl;
    std::cout << GridLogMessage
 	      << "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl;
  }
-  virtual void smear(GaugeField&, const GaugeField&) const;
+  //Set the class to perform the default measurements: 
  //the plaquette energy density every step
  //the plaquette topological charge every 'topq_meas_interval' steps
  //and output to stdout
  void setDefaultMeasurements(int topq_meas_interval = 1);
-  virtual void derivative(GaugeField&, const GaugeField&, const GaugeField&) const {
+  void derivative(GaugeField&, const GaugeField&, const GaugeField&) const override{
    assert(0);
    // undefined for WilsonFlow
  }
-  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
+  //Compute t^2 <E(t)> for time t from the plaquette
-  RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
+  static RealD energyDensityPlaquette(const RealD t, const GaugeField& U);
-  RealD energyDensityPlaquette(const GaugeField& U) const;
+
  //Compute t^2 <E(t)> for time t from the 1x1 cloverleaf form
  //t is the Wilson flow time
  static RealD energyDensityCloverleaf(const RealD t, const GaugeField& U);
  //Evolve the gauge field by Nstep steps of epsilon and return the energy density computed every interval steps
  //The smeared field is output as V
  std::vector<RealD> flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval = 1);
  //Version that does not return the smeared field
  std::vector<RealD> flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval = 1);
  //Evolve the gauge field by Nstep steps of epsilon and return the Cloverleaf energy density computed every interval steps
  //The smeared field is output as V
  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval = 1);
  //Version that does not return the smeared field
  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1);
 };
 //Basic iterative Wilson flow
 template <class Gimpl>
 class WilsonFlow: public WilsonFlowBase<Gimpl>{
 private:
  int Nstep; //number of steps
  RealD epsilon;  //step size
  //Evolve the gauge field by 1 step of size eps and update tau
  void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
 public:
  INHERIT_GIMPL_TYPES(Gimpl)
  //Integrate the Wilson flow for Nstep steps of size epsilon
  WilsonFlow(const RealD epsilon, const int Nstep, unsigned int meas_interval = 1): WilsonFlowBase<Gimpl>(meas_interval), Nstep(Nstep), epsilon(epsilon){}
  void smear(GaugeField& out, const GaugeField& in) const override;
 };
 //Wilson flow with adaptive step size
 template <class Gimpl>
 class WilsonFlowAdaptive: public WilsonFlowBase<Gimpl>{
 private:
  RealD init_epsilon; //initial step size
  RealD maxTau; //integrate to t=maxTau
  RealD tolerance; //integration error tolerance
  //Evolve the gauge field by 1 step and update tau and the current time step eps
  //
  //If the step size eps is too large that a significant integration error results,
  //the gauge field (U) and tau will not be updated and the function will return 0; eps will be adjusted to a smaller
  //value for the next iteration.
  //
  //For a successful integration step the function will return 1
  int evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps) const;
 public:
  INHERIT_GIMPL_TYPES(Gimpl)
  WilsonFlowAdaptive(const RealD init_epsilon, const RealD maxTau, const RealD tolerance, unsigned int meas_interval = 1): 
  WilsonFlowBase<Gimpl>(meas_interval), init_epsilon(init_epsilon), maxTau(maxTau), tolerance(tolerance){}
  void smear(GaugeField& out, const GaugeField& in) const override;
 };
 ////////////////////////////////////////////////////////////////////////////////
 // Implementations
 ////////////////////////////////////////////////////////////////////////////////
 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
+RealD WilsonFlowBase<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
  static WilsonGaugeAction<Gimpl> SG(3.0);
  return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
 }
 //Compute t^2 <E(t)> for time from the 1x1 cloverleaf form
 template <class Gimpl>
 RealD WilsonFlowBase<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;
  assert(Nd == 4);
  //E = 1/2 tr( F_munu F_munu )
  //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
  //F_01 F_02 F_03   F_12 F_13  F_23
  GaugeMat F(U.Grid());
  LatticeComplexD R(U.Grid());
  R = Zero();
  for(int mu=0;mu<3;mu++){
    for(int nu=mu+1;nu<4;nu++){
      WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
      R = R + trace(F*F);
    }
  }
  ComplexD out = sum(R);
  out = t*t*out / RealD(U.Grid()->gSites());
  return -real(out); //minus sign necessary for +ve energy
 }
 template <class Gimpl>
 std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
  std::vector<RealD> out;
  resetActions();
  addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ 
      std::cout << GridLogMessage << "[WilsonFlow] Computing plaquette energy density for step " << step << std::endl;
      out.push_back( energyDensityPlaquette(t,U) );
    });      
  smear(V,U);
  return out;
 }
 template <class Gimpl>
 std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
  GaugeField V(U);
  return flowMeasureEnergyDensityPlaquette(V,U, measure_interval);
 }
 template <class Gimpl>
 std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
  std::vector<RealD> out;
  resetActions();
  addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ 
      std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl;
      out.push_back( energyDensityCloverleaf(t,U) );
    });      
  smear(V,U);
  return out;
 }
 template <class Gimpl>
 std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
  GaugeField V(U);
  return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval);
 }
 template <class Gimpl>
 void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
  addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
      std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "  << step << "  " << t << "  " << energyDensityPlaquette(t,U) << std::endl;
    });
  addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "  << step << "  " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
    });
 }
 template <class Gimpl>
 void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
  GaugeField Z(U.Grid());
  GaugeField tmp(U.Grid());
-  SG.deriv(U, Z);
+  this->SG.deriv(U, Z);
  Z *= 0.25;                                  // Z0 = 1/4 * F(U)
  Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
  Z *= -17.0/8.0;
-  SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
+  this->SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
  Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
  Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
  Z *= -4.0/3.0;
-  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
+  this->SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
  tau += epsilon;
 }
 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
+void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
-  if (maxTau - taus < epsilon){
+  std::cout << GridLogMessage
-    epsilon = maxTau-taus;
+	    << "[WilsonFlow] Nstep   : " << Nstep << std::endl;
-  }
+  std::cout << GridLogMessage
-  //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
+	    << "[WilsonFlow] epsilon : " << epsilon << std::endl;
-  GaugeField Z(U.Grid());
+  std::cout << GridLogMessage
-  GaugeField Zprime(U.Grid());
+	    << "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl;
  GaugeField tmp(U.Grid()), Uprime(U.Grid());
  Uprime = U;
  SG.deriv(U, Z);
  Zprime = -Z;
  Z *= 0.25;                                  // Z0 = 1/4 * F(U)
  Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
  Z *= -17.0/8.0;
  SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
  Zprime += 2.0*tmp;
  Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
  Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
  Z *= -4.0/3.0;
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
  // Ramos 
  Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
  // Compute distance as norm^2 of the difference
  GaugeField diffU = U - Uprime;
  RealD diff = norm2(diffU);
  // adjust integration step
  taus += epsilon;
  //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
  epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
  //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 }
 template <class Gimpl>
 RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
  RealD td = tau(step);
  return 2.0 * td * td * SG.S(U)/U.Grid()->gSites();
 }
 template <class Gimpl>
 RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
  return 2.0 * taus * taus * SG.S(U)/U.Grid()->gSites();
 }
 //#define WF_TIMING 
 template <class Gimpl>
 void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
  out = in;
-  for (unsigned int step = 1; step <= Nstep; step++) {
+  RealD taus = 0.;
  for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
    auto start = std::chrono::high_resolution_clock::now();
-    evolve_step(out);
+    evolve_step(out, taus);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
 #ifdef WF_TIMING
    std::cout << "Time to evolve " << diff.count() << " s\n";
 #endif
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
+    //Perform measurements
-		  << step << "  " << tau(step) << "  " 
+    for(auto const &meas : this->functions)
-	      << energyDensityPlaquette(step,out) << std::endl;
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
    if( step % measure_interval == 0){
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
 		<< step << "  " 
 		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
    }
  }
 }
 template <class Gimpl>
-void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
+int WilsonFlowAdaptive<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps) const{
  if (maxTau - tau < eps){
    eps = maxTau-tau;
  }
  //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
  GaugeField Z(U.Grid());
  GaugeField Zprime(U.Grid());
  GaugeField tmp(U.Grid()), Uprime(U.Grid()), Usave(U.Grid());
  Uprime = U;
  Usave = U;
  this->SG.deriv(U, Z);
  Zprime = -Z;
  Z *= 0.25;                                  // Z0 = 1/4 * F(U)
  Gimpl::update_field(Z, U, -2.0*eps);    // U = W1 = exp(ep*Z0)*W0
  Z *= -17.0/8.0;
  this->SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
  Zprime += 2.0*tmp;
  Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
  Gimpl::update_field(Z, U, -2.0*eps);    // U_= W2 = exp(ep*Z)*W1
  Z *= -4.0/3.0;
  this->SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
  Gimpl::update_field(Z, U, -2.0*eps);    // V(t+e) = exp(ep*Z)*W2
  // Ramos arXiv:1301.4388
  Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
  // Compute distance using Ramos' definition
  GaugeField diffU = U - Uprime;
  RealD max_dist = 0;
  for(int mu=0;mu<Nd;mu++){
    typename Gimpl::GaugeLinkField diffU_mu = PeekIndex<LorentzIndex>(diffU, mu);
    RealD dist_mu = sqrt( maxLocalNorm2(diffU_mu) ) /Nc/Nc; //maximize over sites
    max_dist = std::max(max_dist, dist_mu); //maximize over mu
  }
  int ret;
  if(max_dist < tolerance) {
    tau += eps;
    ret = 1;
  } else {
    U = Usave;
    ret = 0;
  }
  eps = eps*0.95*std::pow(tolerance/max_dist,1./3.);
  std::cout << GridLogMessage << "Adaptive smearing : Distance: "<< max_dist <<" Step successful: " << ret << " New epsilon: " << eps << std::endl; 
  return ret;
 }
 template <class Gimpl>
 void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
  std::cout << GridLogMessage
 	    << "[WilsonFlow] initial epsilon : " << init_epsilon << std::endl;
  std::cout << GridLogMessage
 	    << "[WilsonFlow] full trajectory : " << maxTau << std::endl;
  std::cout << GridLogMessage
 	    << "[WilsonFlow] tolerance   : " << tolerance << std::endl;
  out = in;
-  taus = epsilon;
+  RealD taus = 0.;
  RealD eps = init_epsilon;
  unsigned int step = 0;
  do{
-    step++;
+    int step_success = evolve_step_adaptive(out, taus, eps); 
-    //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
+    step += step_success; //step will not be incremented if the integration step fails
-    evolve_step_adaptive(out, maxTau);
+
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
+    //Perform measurements
-		  << step << "  " << taus << "  "
+    if(step_success)
-	      << energyDensityPlaquette(out) << std::endl;
+      for(auto const &meas : this->functions)
-    if( step % measure_interval == 0){
+	if( step % meas.first == 0 ) meas.second(step,taus,out);
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
 		<< step << "  " 
 		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
    }
  } while (taus < maxTau);
 }
 NAMESPACE_END(Grid);
@@ -88,6 +88,12 @@ namespace PeriodicBC {
    return CovShiftBackward(Link,mu,arg);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices
  template<class gauge> Lattice<gauge>
  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
  {
    return Cshift(Link, mu, shift);
  }
 }
@@ -158,6 +164,9 @@ namespace ConjugateBC {
    //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
  //       = U^T_\mu(L-1)  | x_\mu == 0
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) {
    GridBase *grid = Link.Grid();
@@ -176,6 +185,9 @@ namespace ConjugateBC {
    return Link;
  }
  //Out(x) = S_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = S*_\mu(0)  | x_\mu == L-1
  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
@@ -208,6 +220,47 @@ namespace ConjugateBC {
    return CovShiftBackward(Link,mu,arg);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = U*_\mu(0)  | x_\mu == L-1
  //shift = -1
  //Out(x) = U_\mu(x-mu)  | x_\mu != 0
  //       = U*_\mu(L-1)  | x_\mu == 0
  //shift = 2
  //Out(x) = U_\mu(x+2\hat\mu)  | x_\mu < L-2
  //       = U*_\mu(1)  | x_\mu == L-1
  //       = U*_\mu(0)  | x_\mu == L-2
  //shift = -2
  //Out(x) = U_\mu(x-2mu)  | x_\mu > 1
  //       = U*_\mu(L-2)  | x_\mu == 0
  //       = U*_\mu(L-1)  | x_\mu == 1
  //etc
  template<class gauge> Lattice<gauge>
  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
  {
    GridBase *grid = Link.Grid();
    int Lmu = grid->GlobalDimensions()[mu];
    assert(abs(shift) < Lmu && "Invalid shift value");
    Lattice<iScalar<vInteger>> coor(grid);
    LatticeCoordinate(coor, mu);
    Lattice<gauge> tmp(grid);
    if(shift > 0){
      tmp = Cshift(Link, mu, shift);
      tmp = where(coor >= Lmu-shift, conjugate(tmp), tmp);
      return tmp;
    }else if(shift < 0){
      tmp = Link;
      tmp = where(coor >= Lmu+shift, conjugate(tmp), tmp);
      return Cshift(tmp, mu, shift);
    }
    //shift == 0
    return Link;
  }
 }
@@ -40,27 +40,45 @@ public:
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;
-  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
+  //A_\mu(x) = -i Ta(U_\mu(x) )   where Ta(U) = 1/2( U - U^dag ) - 1/2N tr(U - U^dag)  is the traceless antihermitian part. This is an O(A^3) approximation to the logarithm of U
-    for(int mu=0;mu<Nd;mu++){
+  static void GaugeLinkToLieAlgebraField(const GaugeMat &U, GaugeMat &A) {
-      Complex cmi(0.0,-1.0);
+    Complex cmi(0.0,-1.0);
-      A[mu] = Ta(U[mu]) * cmi;
+    A = Ta(U) * cmi;
    }
  }
-  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu,int orthog) {
+  
  //The derivative of the Lie algebra field
  static void DmuAmu(const std::vector<GaugeMat> &U, GaugeMat &dmuAmu,int orthog) {
    GridBase* grid = U[0].Grid();
    GaugeMat Ax(grid);
    GaugeMat Axm1(grid);
    GaugeMat Utmp(grid);
    dmuAmu=Zero();
    for(int mu=0;mu<Nd;mu++){
      if ( mu != orthog ) {
-	dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
+	//Rather than define functionality to work out how the BCs apply to A_\mu we simply use the BC-aware Cshift to the gauge links and compute A_\mu(x) and A_\mu(x-1) separately
 	//Ax = A_\mu(x)
 	GaugeLinkToLieAlgebraField(U[mu], Ax);
 	//Axm1 = A_\mu(x_\mu-1)
 	Utmp = Gimpl::CshiftLink(U[mu], mu, -1);
 	GaugeLinkToLieAlgebraField(Utmp, Axm1);
 	//Derivative
 	dmuAmu = dmuAmu + Ax - Axm1;
      }
    }
  }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
+  //Fix the gauge field Umu
  //0 < alpha < 1 is related to the step size, cf https://arxiv.org/pdf/1405.5812.pdf
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
    GridBase *grid = Umu.Grid();
    GaugeMat xform(grid);
    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
  }
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
  //Fix the gauge field Umu and also return the gauge transformation from the original gauge field, xform
    GridBase *grid = Umu.Grid();
@@ -123,28 +141,25 @@ public:
      }
    }
    std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
-    if (err_on_no_converge) assert(0);
+    if (err_on_no_converge)
      assert(0 && "Gauge fixing did not converge within the specified number of iterations");
  };
-  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();
    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);
-
+    ExpiAlphaDmuAmu(U,g,alpha,dmuAmu,orthog);
    GaugeLinkToLieAlgebraField(U,A);
    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu,orthog);
    Real vol = grid->gSites();
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
    xform = g*xform ;
-    SU<Nc>::GaugeTransform(U,g);
+    SU<Nc>::GaugeTransform<Gimpl>(U,g);
    return trG;
  }
-  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();
@@ -159,11 +174,7 @@ public:
    GaugeMat g(grid);
    GaugeMat dmuAmu_p(grid);
-    std::vector<GaugeMat> A(Nd,grid);
+    DmuAmu(U,dmuAmu,orthog);
    GaugeLinkToLieAlgebraField(U,A);
    DmuAmu(A,dmuAmu,orthog);
    std::vector<int> mask(Nd,1);
    for(int mu=0;mu<Nd;mu++) if (mu==orthog) mask[mu]=0;
@@ -207,16 +218,16 @@ public:
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
    xform = g*xform ;
-    SU<Nc>::GaugeTransform(U,g);
+    SU<Nc>::GaugeTransform<Gimpl>(U,g);
    return trG;
  }
-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu,int orthog) {
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &U,GaugeMat &g, Real alpha, GaugeMat &dmuAmu,int orthog) {
    GridBase *grid = g.Grid();
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
-    DmuAmu(A,dmuAmu,orthog);
+    DmuAmu(U,dmuAmu,orthog);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);
  }  
@@ -615,7 +615,6 @@ public:
    GridBase *grid = out.Grid();
    typedef typename LatticeMatrixType::vector_type vector_type;
    typedef typename LatticeMatrixType::scalar_type scalar_type;
    typedef iSinglet<vector_type> vTComplexType;
@@ -694,32 +693,32 @@ public:
 * Adjoint rep gauge xform
 */
-  template<typename GaugeField,typename GaugeMat>
+  template<typename Gimpl>
-  static void GaugeTransform( GaugeField &Umu, GaugeMat &g){
+  static void GaugeTransform(typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
    GridBase *grid = Umu.Grid();
    conformable(grid,g.Grid());
-    GaugeMat U(grid);
+    typename Gimpl::GaugeLinkField U(grid);
-    GaugeMat ag(grid); ag = adj(g);
+    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
      U= PeekIndex<LorentzIndex>(Umu,mu);
-      U = g*U*Cshift(ag, mu, 1);
+      U = g*U*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
      PokeIndex<LorentzIndex>(Umu,U,mu);
    }
  }
-  template<typename GaugeMat>
+  template<typename Gimpl>
-  static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){
+  static void GaugeTransform( std::vector<typename Gimpl::GaugeLinkField> &U, typename Gimpl::GaugeLinkField &g){
    GridBase *grid = g.Grid();
-    GaugeMat ag(grid); ag = adj(g);
+    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
-      U[mu] = g*U[mu]*Cshift(ag, mu, 1);
+      U[mu] = g*U[mu]*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
    }
  }
-  template<typename GaugeField,typename GaugeMat>
+  template<typename Gimpl>
-  static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g){
+  static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
    LieRandomize(pRNG,g,1.0);
-    GaugeTransform(Umu,g);
+    GaugeTransform<Gimpl>(Umu,g);
  }
  // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
@@ -125,6 +125,57 @@ public:
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // sum over all spatial planes of plaquette
  //////////////////////////////////////////////////
  static void siteSpatialPlaquette(ComplexField &Plaq,
                            const std::vector<GaugeMat> &U) {
    ComplexField sitePlaq(U[0].Grid());
    Plaq = Zero();
    for (int mu = 1; mu < Nd-1; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceDirPlaquette(sitePlaq, U, mu, nu);
        Plaq = Plaq + sitePlaq;
      }
    }
  }
  ////////////////////////////////////
  // sum over all x,y,z and over all spatial planes of plaquette
  //////////////////////////////////////////////////
  static std::vector<RealD> timesliceSumSpatialPlaquette(const GaugeLorentz &Umu) {
    std::vector<GaugeMat> U(Nd, Umu.Grid());
    // inefficient here
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    ComplexField Plaq(Umu.Grid());
    siteSpatialPlaquette(Plaq, U);
    typedef typename ComplexField::scalar_object sobj;
    std::vector<sobj> Tq;
    sliceSum(Plaq, Tq, Nd-1);
    std::vector<Real> out(Tq.size());
    for(int t=0;t<Tq.size();t++) out[t] = TensorRemove(Tq[t]).real();
    return out;
  }
  //////////////////////////////////////////////////
  // average over all x,y,z and over all spatial planes of plaquette
  //////////////////////////////////////////////////
  static std::vector<RealD> timesliceAvgSpatialPlaquette(const GaugeLorentz &Umu) {
    std::vector<RealD> sumplaq = timesliceSumSpatialPlaquette(Umu);
    int Lt = Umu.Grid()->FullDimensions()[Nd-1];
    assert(sumplaq.size() == Lt);
    double vol = Umu.Grid()->gSites() / Lt;
    double faces = (1.0 * (Nd - 1)* (Nd - 2)) / 2.0;
    for(int t=0;t<Lt;t++)
      sumplaq[t] = sumplaq[t] / vol / faces / Nc; // Nd , Nc dependent... FIXME
    return sumplaq;
  }
  //////////////////////////////////////////////////
  // average over all x,y,z the temporal loop
  //////////////////////////////////////////////////
@@ -362,11 +413,11 @@ public:
    GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
    GaugeMat vu = v*u;
      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
-      FS = (u*v + Cshift(vu, mu, -1));
+      FS = (u*v + Gimpl::CshiftLink(vu, mu, -1));
      FS = 0.125*(FS - adj(FS));
  }
-  static Real TopologicalCharge(GaugeLorentz &U){
+  static Real TopologicalCharge(const GaugeLorentz &U){
    // 4d topological charge
    assert(Nd==4);
    // Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
@@ -389,6 +440,203 @@ public:
  }
  //Clover-leaf Wilson loop combination for arbitrary mu-extent M and nu extent N,  mu >= nu
  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 7  for 1x2 Wilson loop    
  //Clockwise ordering
  static void CloverleafMxN(GaugeMat &FS, const GaugeMat &Umu, const GaugeMat &Unu, int mu, int nu, int M, int N){  
 #define Fmu(A) Gimpl::CovShiftForward(Umu, mu, A)
 #define Bmu(A) Gimpl::CovShiftBackward(Umu, mu, A)
 #define Fnu(A) Gimpl::CovShiftForward(Unu, nu, A)
 #define Bnu(A) Gimpl::CovShiftBackward(Unu, nu, A)
 #define FmuI Gimpl::CovShiftIdentityForward(Umu, mu)
 #define BmuI Gimpl::CovShiftIdentityBackward(Umu, mu)
 #define FnuI Gimpl::CovShiftIdentityForward(Unu, nu)
 #define BnuI Gimpl::CovShiftIdentityBackward(Unu, nu)
    //Upper right loop
    GaugeMat tmp = BmuI;
    for(int i=1;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    FS = tmp;
    //Upper left loop
    tmp = BnuI;
    for(int j=1;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    FS = FS + tmp;
    //Lower right loop
    tmp = FnuI;
    for(int j=1;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    FS = FS + tmp;
    //Lower left loop
    tmp = FmuI;
    for(int i=1;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    FS = FS + tmp;
 #undef Fmu
 #undef Bmu
 #undef Fnu
 #undef Bnu
 #undef FmuI
 #undef BmuI
 #undef FnuI
 #undef BnuI
  }
  //Field strength from MxN Wilson loop
  //Note F_numu = - F_munu
  static void FieldStrengthMxN(GaugeMat &FS, const GaugeLorentz &U, int mu, int nu, int M, int N){  
    GaugeMat Umu = PeekIndex<LorentzIndex>(U, mu);
    GaugeMat Unu = PeekIndex<LorentzIndex>(U, nu);
    if(M == N){
      GaugeMat F(Umu.Grid());
      CloverleafMxN(F, Umu, Unu, mu, nu, M, N);
      FS = 0.125 * ( F - adj(F) );
    }else{
      //Average over both orientations
      GaugeMat horizontal(Umu.Grid()), vertical(Umu.Grid());
      CloverleafMxN(horizontal, Umu, Unu, mu, nu, M, N);
      CloverleafMxN(vertical, Umu, Unu, mu, nu, N, M);
      FS = 0.0625 * ( horizontal - adj(horizontal) + vertical - adj(vertical) );
    }
  }
  //Topological charge contribution from MxN Wilson loops
  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf  Eq 6
  //output is the charge by timeslice: sum over timeslices to obtain the total
  static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
    assert(Nd == 4);
    std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
    //Note F_numu = - F_munu
    //hence we only need to loop over mu,nu,rho,sigma that aren't related by permuting mu,nu  or rho,sigma
    //Use nu > mu
    for(int mu=0;mu<Nd-1;mu++){
      for(int nu=mu+1; nu<Nd; nu++){
 	F[mu][nu] = new GaugeMat(U.Grid());
 	FieldStrengthMxN(*F[mu][nu], U, mu, nu, M, N);
      }
    }
    Real coeff = -1./(32 * M_PI*M_PI * M*M * N*N); //overall sign to match CPS and Grid conventions, possibly related to time direction = 3 vs 0
    static const int combs[3][4] = { {0,1,2,3}, {0,2,1,3}, {0,3,1,2} };
    static const int signs[3] = { 1, -1, 1 }; //epsilon_{mu nu rho sigma}
    ComplexField fsum(U.Grid());
    fsum = Zero();
    for(int c=0;c<3;c++){
      int mu = combs[c][0], nu = combs[c][1], rho = combs[c][2], sigma = combs[c][3];
      int eps = signs[c];
      fsum = fsum + (8. * coeff * eps) * trace( (*F[mu][nu]) * (*F[rho][sigma]) ); 
    }
    for(int mu=0;mu<Nd-1;mu++)
      for(int nu=mu+1; nu<Nd; nu++)
 	delete F[mu][nu];
    typedef typename ComplexField::scalar_object sobj;
    std::vector<sobj> Tq;
    sliceSum(fsum, Tq, Nd-1);
    std::vector<Real> out(Tq.size());
    for(int t=0;t<Tq.size();t++) out[t] = TensorRemove(Tq[t]).real();
    return out;
  }
  static Real TopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
    std::vector<Real> Tq = TimesliceTopologicalChargeMxN(U,M,N);
    Real out(0);
    for(int t=0;t<Tq.size();t++) out += Tq[t];
    return out;
  }
  //Generate the contributions to the 5Li topological charge from Wilson loops of the following sizes
  //Use coefficients from hep-lat/9701012
  //1x1 : c1=(19.-55.*c5)/9.
  //2x2 : c2=(1-64.*c5)/9.
  //1x2 : c3=(-64.+640.*c5)/45.
  //1x3 : c4=1./5.-2.*c5
  //3x3 : c5=1./20.
  //Output array outer index contains the loops in the above order
  //Inner index is the time coordinate
  static std::vector<std::vector<Real> > TimesliceTopologicalCharge5LiContributions(const GaugeLorentz &U){
    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };       
    std::vector<std::vector<Real> > out(5);
    for(int i=0;i<5;i++){	
      out[i] = TimesliceTopologicalChargeMxN(U,exts[i][0],exts[i][1]);
    }
    return out;
  }   
  static std::vector<Real> TopologicalCharge5LiContributions(const GaugeLorentz &U){   
    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };
    std::vector<Real> out(5);
    std::cout << GridLogMessage << "Computing topological charge" << std::endl;
    for(int i=0;i<5;i++){
      out[i] = TopologicalChargeMxN(U,exts[i][0],exts[i][1]);
      std::cout << GridLogMessage << exts[i][0] << "x" << exts[i][1] << " Wilson loop contribution " << out[i] << std::endl;
    }
    return out;
  }
  //Compute the 5Li topological charge
  static std::vector<Real> TimesliceTopologicalCharge5Li(const GaugeLorentz &U){
    std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);
    double c5=1./20.;
    double c4=1./5.-2.*c5;
    double c3=(-64.+640.*c5)/45.;
    double c2=(1-64.*c5)/9.;
    double c1=(19.-55.*c5)/9.;
    int Lt = loops[0].size();
    std::vector<Real> out(Lt,0.);
    for(int t=0;t<Lt;t++)
      out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
    return out;
  }
  static Real TopologicalCharge5Li(const GaugeLorentz &U){
    std::vector<Real> Qt = TimesliceTopologicalCharge5Li(U);
    Real Q = 0.;
    for(int t=0;t<Qt.size();t++) Q += Qt[t];
    std::cout << GridLogMessage << "5Li Topological charge: " << Q << std::endl;
    return Q;
  }
  //////////////////////////////////////////////////////
  // Similar to above for rectangle is required
  //////////////////////////////////////////////////////
@@ -501,7 +501,7 @@ struct Conj{
 struct TimesMinusI{
  // Complex
  template <typename T>
-  inline vec<T> operator()(vec<T> a, vec<T> b){
+  inline vec<T> operator()(vec<T> a){
    vec<T> out;
    const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
    svbool_t pg1 = acle<T>::pg1();
@@ -520,7 +520,7 @@ struct TimesMinusI{
 struct TimesI{
  // Complex
  template <typename T>
-  inline vec<T> operator()(vec<T> a, vec<T> b){
+  inline vec<T> operator()(vec<T> a){
    vec<T> out;
    const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
    svbool_t pg1 = acle<T>::pg1();
@@ -418,7 +418,7 @@ struct Conj{
 struct TimesMinusI{
  // Complex float
-  inline vecf operator()(vecf a, vecf b){
+  inline vecf operator()(vecf a){
    lutf tbl_swap = acle<float>::tbl_swap();
    pred pg1 = acle<float>::pg1();
    pred pg_odd = acle<float>::pg_odd();
@@ -428,7 +428,7 @@ struct TimesMinusI{
    return svneg_m(a_v, pg_odd, a_v);
  }
  // Complex double
-  inline vecd operator()(vecd a, vecd b){
+  inline vecd operator()(vecd a){
    lutd tbl_swap = acle<double>::tbl_swap();
    pred pg1 = acle<double>::pg1();
    pred pg_odd = acle<double>::pg_odd();
@@ -441,7 +441,7 @@ struct TimesMinusI{
 struct TimesI{
  // Complex float
-  inline vecf operator()(vecf a, vecf b){
+  inline vecf operator()(vecf a){
    lutf tbl_swap = acle<float>::tbl_swap();
    pred pg1 = acle<float>::pg1();
    pred pg_even = acle<float>::pg_even();
@@ -451,7 +451,7 @@ struct TimesI{
    return svneg_m(a_v, pg_even, a_v);
  }
  // Complex double
-  inline vecd operator()(vecd a, vecd b){
+  inline vecd operator()(vecd a){
    lutd tbl_swap = acle<double>::tbl_swap();
    pred pg1 = acle<double>::pg1();
    pred pg_even = acle<double>::pg_even();
@@ -405,12 +405,12 @@ struct Conj{
 struct TimesMinusI{
  //Complex single
-  inline __m256 operator()(__m256 in, __m256 ret){
+  inline __m256 operator()(__m256 in){
    __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in);   // r,-i
    return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
  }
  //Complex double
-  inline __m256d operator()(__m256d in, __m256d ret){
+  inline __m256d operator()(__m256d in){
    __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
    return _mm256_shuffle_pd(tmp,tmp,0x5);
  }
@@ -418,12 +418,12 @@ struct TimesMinusI{
 struct TimesI{
  //Complex single
-  inline __m256 operator()(__m256 in, __m256 ret){
+  inline __m256 operator()(__m256 in){
    __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
    return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);          // i,-r
  }
  //Complex double
-  inline __m256d operator()(__m256d in, __m256d ret){
+  inline __m256d operator()(__m256d in){
    __m256d tmp = _mm256_shuffle_pd(in,in,0x5);
    return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
  }
@@ -271,14 +271,14 @@ struct Conj{
 struct TimesMinusI{
  //Complex single
-  inline __m512 operator()(__m512 in, __m512 ret){
+  inline __m512 operator()(__m512 in){
    //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
    //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0));   // 0x4E??
    __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
    return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
  }
  //Complex double
-  inline __m512d operator()(__m512d in, __m512d ret){
+  inline __m512d operator()(__m512d in){
    //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
    //return _mm512_shuffle_pd(tmp,tmp,0x55);
    __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
@@ -288,17 +288,16 @@ struct TimesMinusI{
 struct TimesI{
  //Complex single
-  inline __m512 operator()(__m512 in, __m512 ret){
+  inline __m512 operator()(__m512 in){
    __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
    return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); 
  }
  //Complex double
-  inline __m512d operator()(__m512d in, __m512d ret){
+  inline __m512d operator()(__m512d in){
    __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
    return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); 
  }
 };
 // Gpermute utilities consider coalescing into 1 Gpermute
@@ -0,0 +1,666 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/simd/Grid_vector_types.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template <class Scalar_type, class Vector_type>
 class Grid_simd2 {
 public:
  typedef typename RealPart<Scalar_type>::type Real;
  typedef Vector_type vector_type;
  typedef Scalar_type scalar_type;
  typedef union conv_t_union {
    Vector_type v;
    Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
    accelerator_inline conv_t_union(){};
  } conv_t;
  static constexpr int nvec=2;
  Vector_type v[nvec];
  static accelerator_inline constexpr int Nsimd(void) {
    static_assert( (sizeof(Vector_type) / sizeof(Scalar_type) >= 1), " size mismatch " );
    return nvec*sizeof(Vector_type) / sizeof(Scalar_type);
  }
  accelerator_inline Grid_simd2 &operator=(const Grid_simd2 &&rhs) {
    for(int n=0;n<nvec;n++) v[n] = rhs.v[n];
    return *this;
  };
  accelerator_inline Grid_simd2 &operator=(const Grid_simd2 &rhs) {
    for(int n=0;n<nvec;n++) v[n] = rhs.v[n];
    return *this;
  };  // faster than not declaring it and leaving to the compiler
  accelerator Grid_simd2() = default;
  accelerator_inline Grid_simd2(const Grid_simd2 &rhs) {    for(int n=0;n<nvec;n++) v[n] = rhs.v[n];  };
  accelerator_inline Grid_simd2(const Grid_simd2 &&rhs){    for(int n=0;n<nvec;n++) v[n] = rhs.v[n];  };
  accelerator_inline Grid_simd2(const Real a) { vsplat(*this, Scalar_type(a)); };
  // Enable if complex type
  template <typename S = Scalar_type> accelerator_inline
  Grid_simd2(const typename std::enable_if<is_complex<S>::value, S>::type a) {
      vsplat(*this, a);
  };
  /////////////////////////////
  // Constructors
  /////////////////////////////
  accelerator_inline Grid_simd2 &  operator=(const Zero &z) {
    vzero(*this);
    return (*this);
  }
  ///////////////////////////////////////////////
  // mac, mult, sub, add, adj
  ///////////////////////////////////////////////
  friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
 				     const Grid_simd2 *__restrict__ a,
 				     const Grid_simd2 *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };
  friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
 				      const Grid_simd2 *__restrict__ l,
 				      const Grid_simd2 *__restrict__ r) {
    *y = (*l) * (*r);
  }
  friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
 				     const Grid_simd2 *__restrict__ l,
 				     const Grid_simd2 *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
 				     const Grid_simd2 *__restrict__ l,
 				     const Grid_simd2 *__restrict__ r) {
    *y = (*l) + (*r);
  }
  friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
 				     const Scalar_type *__restrict__ a,
 				     const Grid_simd2 *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };
  friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
 				      const Scalar_type *__restrict__ l,
 				      const Grid_simd2 *__restrict__ r) {
    *y = (*l) * (*r);
  }
  friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
 				     const Scalar_type *__restrict__ l,
 				     const Grid_simd2 *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
 				     const Scalar_type *__restrict__ l,
 				     const Grid_simd2 *__restrict__ r) {
    *y = (*l) + (*r);
  }
  friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
 				     const Grid_simd2 *__restrict__ a,
 				     const Scalar_type *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };
  friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
 				      const Grid_simd2 *__restrict__ l,
 				      const Scalar_type *__restrict__ r) {
    *y = (*l) * (*r);
  }
  friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
 				     const Grid_simd2 *__restrict__ l,
 				     const Scalar_type *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
 				     const Grid_simd2 *__restrict__ l,
 				     const Scalar_type *__restrict__ r) {
    *y = (*l) + (*r);
  }
  ////////////////////////////////////////////////////////////////////////
  // FIXME:  gonna remove these load/store, get, set, prefetch
  ////////////////////////////////////////////////////////////////////////
  friend accelerator_inline void vset(Grid_simd2 &ret, Scalar_type *a) {
    for(int n=0;n<nvec;n++) vset(ret.v[n],a);
  }
  ///////////////////////
  // Vstore
  ///////////////////////
  friend accelerator_inline void vstore(const Grid_simd2 &ret, Scalar_type *a) {
    for(int n=0;n<nvec;n++) vstore(ret.v[n],a);
  }
  ///////////////////////
  // Vprefetch
  ///////////////////////
  friend accelerator_inline void vprefetch(const Grid_simd2 &v) {
    vprefetch(v.v[0]);
  }
  ///////////////////////
  // Reduce
  ///////////////////////
  friend accelerator_inline Scalar_type Reduce(const Grid_simd2 &in) {
    return Reduce(in.v[0])+ Reduce(in.v[1]);
  }
  ////////////////////////////
  // operator scalar * simd
  ////////////////////////////
  friend accelerator_inline Grid_simd2 operator*(const Scalar_type &a, Grid_simd2 b) {
    Grid_simd2 va;
    vsplat(va, a);
    return va * b;
  }
  friend accelerator_inline Grid_simd2 operator*(Grid_simd2 b, const Scalar_type &a) {
    return a * b;
  }
  //////////////////////////////////
  // Divides
  //////////////////////////////////
  friend accelerator_inline Grid_simd2 operator/(const Scalar_type &a, Grid_simd2 b) {
    Grid_simd2 va;
    vsplat(va, a);
    return va / b;
  }
  friend accelerator_inline Grid_simd2 operator/(Grid_simd2 b, const Scalar_type &a) {
    Grid_simd2 va;
    vsplat(va, a);
    return b / a;
  }
  ///////////////////////
  // Unary negation
  ///////////////////////
  friend accelerator_inline Grid_simd2 operator-(const Grid_simd2 &r) {
    Grid_simd2 ret;
    vzero(ret);
    ret = ret - r;
    return ret;
  }
  // *=,+=,-= operators
  accelerator_inline Grid_simd2 &operator*=(const Grid_simd2 &r) {
    *this = (*this) * r;
    return *this;
  }
  accelerator_inline Grid_simd2 &operator+=(const Grid_simd2 &r) {
    *this = *this + r;
    return *this;
  }
  accelerator_inline Grid_simd2 &operator-=(const Grid_simd2 &r) {
    *this = *this - r;
    return *this;
  }
  ///////////////////////////////////////
  // Not all functions are supported
  // through SIMD and must breakout to
  // scalar type and back again. This
  // provides support
  ///////////////////////////////////////
  template <class functor>
  friend accelerator_inline Grid_simd2 SimdApply(const functor &func, const Grid_simd2 &v) {
    Grid_simd2 ret;
    for(int n=0;n<nvec;n++){
      ret.v[n]=SimdApply(func,v.v[n]);
    }
    return ret;
  }
  template <class functor>
  friend accelerator_inline Grid_simd2 SimdApplyBinop(const functor &func,
                                         const Grid_simd2 &x,
                                         const Grid_simd2 &y) {
    Grid_simd2 ret;
    for(int n=0;n<nvec;n++){
      ret.v[n]=SimdApplyBinop(func,x.v[n],y.v[n]);
    }
    return ret;
  }
  ///////////////////////
  // Exchange
  // Al Ah , Bl Bh -> Al Bl Ah,Bh
  ///////////////////////
  friend accelerator_inline void exchange0(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
      out1.v[0] = in1.v[0];
      out1.v[1] = in2.v[0];
      out2.v[0] = in1.v[1];
      out2.v[1] = in2.v[1];
  }
  friend accelerator_inline void exchange1(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
    exchange0(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
    exchange0(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
  }
  friend accelerator_inline void exchange2(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
    exchange1(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
    exchange1(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
  }
  friend accelerator_inline void exchange3(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
    exchange2(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
    exchange2(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
  }
  friend accelerator_inline void exchange4(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
    exchange3(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
    exchange3(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
  }
  friend accelerator_inline void exchange(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2,int n)
  {
    if       (n==3) {
      exchange3(out1,out2,in1,in2);
    } else if(n==2) {
      exchange2(out1,out2,in1,in2);
    } else if(n==1) {
      exchange1(out1,out2,in1,in2);
    } else if(n==0) {
      exchange0(out1,out2,in1,in2);
    }
  }
  ////////////////////////////////////////////////////////////////////
  // General permute; assumes vector length is same across
  // all subtypes; may not be a good assumption, but could
  // add the vector width as a template param for BG/Q for example
  ////////////////////////////////////////////////////////////////////
  friend accelerator_inline void permute0(Grid_simd2 &y, Grid_simd2 b) {
    y.v[0]=b.v[1];
    y.v[1]=b.v[0];
  }
  friend accelerator_inline void permute1(Grid_simd2 &y, Grid_simd2 b) {
    permute0(y.v[0],b.v[0]);
    permute0(y.v[1],b.v[1]);
  }
  friend accelerator_inline void permute2(Grid_simd2 &y, Grid_simd2 b) {
    permute1(y.v[0],b.v[0]);
    permute1(y.v[1],b.v[1]);
  }
  friend accelerator_inline void permute3(Grid_simd2 &y, Grid_simd2 b) {
    permute2(y.v[0],b.v[0]);
    permute2(y.v[1],b.v[1]);
  }
  friend accelerator_inline void permute4(Grid_simd2 &y, Grid_simd2 b) {
    permute3(y.v[0],b.v[0]);
    permute3(y.v[1],b.v[1]);
  }
  friend accelerator_inline void permute(Grid_simd2 &y, Grid_simd2 b, int perm) {
    if(perm==3) permute3(y, b);
    else if(perm==2) permute2(y, b);
    else if(perm==1) permute1(y, b);
    else if(perm==0) permute0(y, b);
  }
  ///////////////////////////////
  // Getting single lanes
  ///////////////////////////////
  accelerator_inline Scalar_type getlane(int lane) const {
    if(lane < vector_type::Nsimd() ) return v[0].getlane(lane);
    else                             return v[1].getlane(lane%vector_type::Nsimd());
  }
  accelerator_inline void putlane(const Scalar_type &S, int lane){
    if(lane < vector_type::Nsimd() ) v[0].putlane(S,lane);
    else                             v[1].putlane(S,lane%vector_type::Nsimd());
  }
 };  // end of Grid_simd2 class definition
 ///////////////////////////////
 // Define available types
 ///////////////////////////////
 typedef Grid_simd2<complex<double>  , vComplexD>  vComplexD2;
 typedef Grid_simd2<double           , vRealD>     vRealD2;
 /////////////////////////////////////////
 // Some traits to recognise the types
 /////////////////////////////////////////
 template <typename T>
 struct is_simd : public std::false_type {};
 template <> struct is_simd<vRealF>     : public std::true_type {};
 template <> struct is_simd<vRealD>     : public std::true_type {};
 template <> struct is_simd<vRealH>     : public std::true_type {};
 template <> struct is_simd<vComplexF>  : public std::true_type {};
 template <> struct is_simd<vComplexD>  : public std::true_type {};
 template <> struct is_simd<vComplexH>  : public std::true_type {};
 template <> struct is_simd<vInteger>   : public std::true_type {};
 template <> struct is_simd<vRealD2>    : public std::true_type {};
 template <> struct is_simd<vComplexD2> : public std::true_type {};
 template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
 template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
 ///////////////////////////////////////////////
 // insert / extract with complex support
 ///////////////////////////////////////////////
 template <class S, class V>
 accelerator_inline S getlane(const Grid_simd<S, V> &in,int lane) {
  return in.getlane(lane);
 }
 template <class S, class V>
 accelerator_inline void putlane(Grid_simd<S, V> &vec,const S &_S, int lane){
  vec.putlane(_S,lane);
 }
 template <class S,IfNotSimd<S> = 0 >
 accelerator_inline S getlane(const S &in,int lane) {
  return in;
 }
 template <class S,IfNotSimd<S> = 0 >
 accelerator_inline void putlane(S &vec,const S &_S, int lane){
  vec = _S;
 }
 template <class S, class V>
 accelerator_inline S getlane(const Grid_simd2<S, V> &in,int lane) {
  return in.getlane(lane);
 }
 template <class S, class V>
 accelerator_inline void putlane(Grid_simd2<S, V> &vec,const S &_S, int lane){
  vec.putlane(_S,lane);
 }
 ////////////////////////////////////////////////////////////////////
 // General rotate
 ////////////////////////////////////////////////////////////////////
 template <class S, class V>
 accelerator_inline void vbroadcast(Grid_simd2<S,V> &ret,const Grid_simd2<S,V> &src,int lane){
  S* typepun =(S*) &src;
  vsplat(ret,typepun[lane]);
 }
 template <class S, class V, IfComplex<S> =0>
 accelerator_inline void rbroadcast(Grid_simd2<S,V> &ret,const Grid_simd2<S,V> &src,int lane){
  typedef typename V::vector_type vector_type;
  S* typepun =(S*) &src;
  ret.v[0].v = unary<vector_type>(real(typepun[lane]), VsplatSIMD());
  ret.v[1].v = unary<vector_type>(real(typepun[lane]), VsplatSIMD());
 }
 ///////////////////////
 // Splat
 ///////////////////////
 // this is only for the complex version
 template <class S, class V, IfComplex<S> = 0, class ABtype>
 accelerator_inline void vsplat(Grid_simd2<S, V> &ret, ABtype a, ABtype b) {
  vsplat(ret.v[0],a,b);
  vsplat(ret.v[1],a,b);
 }
 // overload if complex
 template <class S, class V>
 accelerator_inline void vsplat(Grid_simd2<S, V> &ret, EnableIf<is_complex<S>, S> c) {
  vsplat(ret, real(c), imag(c));
 }
 template <class S, class V>
 accelerator_inline void rsplat(Grid_simd2<S, V> &ret, EnableIf<is_complex<S>, S> c) {
  vsplat(ret, real(c), real(c));
 }
 // if real fill with a, if complex fill with a in the real part (first function
 // above)
 template <class S, class V>
 accelerator_inline void vsplat(Grid_simd2<S, V> &ret, NotEnableIf<is_complex<S>, S> a)
 {
  vsplat(ret.v[0],a);
  vsplat(ret.v[1],a);
 }
 //////////////////////////
 ///////////////////////////////////////////////
 // Initialise to 1,0,i for the correct types
 ///////////////////////////////////////////////
 // For complex types
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void vone(Grid_simd2<S, V> &ret) {
  vsplat(ret, S(1.0, 0.0));
 }
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
  vsplat(ret, S(0.0, 0.0));
 }  // use xor?
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void vcomplex_i(Grid_simd2<S, V> &ret) {
  vsplat(ret, S(0.0, 1.0));
 }
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void visign(Grid_simd2<S, V> &ret) {
  vsplat(ret, S(1.0, -1.0));
 }
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void vrsign(Grid_simd2<S, V> &ret) {
  vsplat(ret, S(-1.0, 1.0));
 }
 // if not complex overload here
 template <class S, class V, IfReal<S> = 0>
 accelerator_inline void vone(Grid_simd2<S, V> &ret) {
  vsplat(ret, S(1.0));
 }
 template <class S, class V, IfReal<S> = 0>
 accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
  vsplat(ret, S(0.0));
 }
 // For integral types
 template <class S, class V, IfInteger<S> = 0>
 accelerator_inline void vone(Grid_simd2<S, V> &ret) {
  vsplat(ret, 1);
 }
 template <class S, class V, IfInteger<S> = 0>
 accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
  vsplat(ret, 0);
 }
 template <class S, class V, IfInteger<S> = 0>
 accelerator_inline void vtrue(Grid_simd2<S, V> &ret) {
  vsplat(ret, 0xFFFFFFFF);
 }
 template <class S, class V, IfInteger<S> = 0>
 accelerator_inline void vfalse(Grid_simd2<S, V> &ret) {
  vsplat(ret, 0);
 }
 template <class S, class V>
 accelerator_inline void zeroit(Grid_simd2<S, V> &z) {
  vzero(z);
 }
 ///////////////////////
 // Vstream
 ///////////////////////
 template <class S, class V, IfReal<S> = 0>
 accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
  vstream(out.v[0],in.v[0]);
  vstream(out.v[1],in.v[1]);
 }
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
  vstream(out.v[0],in.v[0]);
  vstream(out.v[1],in.v[1]);
 }
 template <class S, class V, IfInteger<S> = 0>
 accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
  vstream(out.v[0],in.v[0]);
  vstream(out.v[1],in.v[1]);
 }
 ////////////////////////////////////
 // Arithmetic operator overloads +,-,*
 ////////////////////////////////////
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> operator+(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
  Grid_simd2<S, V> ret;
  ret.v[0] = a.v[0]+b.v[0];
  ret.v[1] = a.v[1]+b.v[1];
  return ret;
 };
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> operator-(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
  Grid_simd2<S, V> ret;
  ret.v[0] = a.v[0]-b.v[0];
  ret.v[1] = a.v[1]-b.v[1];
  return ret;
 };
 // Distinguish between complex types and others
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline Grid_simd2<S, V> real_mult(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
  Grid_simd2<S, V> ret;
  ret.v[0] =real_mult(a.v[0],b.v[0]);
  ret.v[1] =real_mult(a.v[1],b.v[1]);
  return ret;
 };
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline Grid_simd2<S, V> real_madd(Grid_simd2<S, V> a, Grid_simd2<S, V> b, Grid_simd2<S,V> c) {
  Grid_simd2<S, V> ret;
  ret.v[0] =real_madd(a.v[0],b.v[0],c.v[0]);
  ret.v[1] =real_madd(a.v[1],b.v[1],c.v[1]);
  return ret;
 };
 // Distinguish between complex types and others
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> operator*(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
  Grid_simd2<S, V> ret;
  ret.v[0] = a.v[0]*b.v[0];
  ret.v[1] = a.v[1]*b.v[1];
  return ret;
 };
 // Distinguish between complex types and others
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> operator/(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
  Grid_simd2<S, V> ret;
  ret.v[0] = a.v[0]/b.v[0];
  ret.v[1] = a.v[1]/b.v[1];
  return ret;
 };
 ///////////////////////
 // Conjugate
 ///////////////////////
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> conjugate(const Grid_simd2<S, V> &in) {
  Grid_simd2<S, V> ret;
  ret.v[0] = conjugate(in.v[0]);
  ret.v[1] = conjugate(in.v[1]);
  return ret;
 }
 template <class S, class V, IfNotInteger<S> = 0>
 accelerator_inline Grid_simd2<S, V> adj(const Grid_simd2<S, V> &in) {
  return conjugate(in);
 }
 ///////////////////////
 // timesMinusI
 ///////////////////////
 template <class S, class V>
 accelerator_inline void timesMinusI(Grid_simd2<S, V> &ret, const Grid_simd2<S, V> &in) {
  timesMinusI(ret.v[0],in.v[0]);
  timesMinusI(ret.v[1],in.v[1]);
 }
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> timesMinusI(const Grid_simd2<S, V> &in) {
  Grid_simd2<S, V> ret;
  timesMinusI(ret.v[0],in.v[0]);
  timesMinusI(ret.v[1],in.v[1]);
  return ret;
 }
 ///////////////////////
 // timesI
 ///////////////////////
 template <class S, class V>
 accelerator_inline void timesI(Grid_simd2<S, V> &ret, const Grid_simd2<S, V> &in) {
  timesI(ret.v[0],in.v[0]);
  timesI(ret.v[1],in.v[1]);
 }
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> timesI(const Grid_simd2<S, V> &in) {
  Grid_simd2<S, V> ret;
  timesI(ret.v[0],in.v[0]);
  timesI(ret.v[1],in.v[1]);
  return ret;
 }
 /////////////////////
 // Inner, outer
 /////////////////////
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> innerProduct(const Grid_simd2<S, V> &l,const Grid_simd2<S, V> &r) {
  return conjugate(l) * r;
 }
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> outerProduct(const Grid_simd2<S, V> &l,const Grid_simd2<S, V> &r) {
  return l * conjugate(r);
 }
 template <class S, class V>
 accelerator_inline Grid_simd2<S, V> trace(const Grid_simd2<S, V> &arg) {
  return arg;
 }
 ////////////////////////////////////////////////////////////
 // copy/splat complex real parts into real;
 // insert real into complex and zero imag;
 ////////////////////////////////////////////////////////////
 accelerator_inline void precisionChange(vComplexD2 &out,const vComplexF  &in){
  Optimization::PrecisionChange::StoD(in.v,out.v[0].v,out.v[1].v);
 }
 accelerator_inline void precisionChange(vComplexF  &out,const vComplexD2 &in){
  out.v=Optimization::PrecisionChange::DtoS(in.v[0].v,in.v[1].v);
 }
 accelerator_inline void precisionChange(vComplexD2 *out,const vComplexF  *in,int nvec){
  for(int m=0;m<nvec;m++){ precisionChange(out[m],in[m]); }
 }
 accelerator_inline void precisionChange(vComplexF  *out,const vComplexD2 *in,int nvec){
  for(int m=0;m<nvec;m++){ precisionChange(out[m],in[m]); }
 }
 accelerator_inline void precisionChange(vRealD2 &out,const vRealF  &in){
  Optimization::PrecisionChange::StoD(in.v,out.v[0].v,out.v[1].v);
 }
 accelerator_inline void precisionChange(vRealF  &out,const vRealD2 &in){
  out.v=Optimization::PrecisionChange::DtoS(in.v[0].v,in.v[1].v);
 }
 accelerator_inline void precisionChange(vRealD2 *out,const vRealF  *in,int nvec){
  for(int m=0;m<nvec;m++){ precisionChange(out[m],in[m]); }
 }
 accelerator_inline void precisionChange(vRealF  *out,const vRealD2 *in,int nvec){
  for(int m=0;m<nvec;m++){ precisionChange(out[m],in[m]); }
 }
 NAMESPACE_END(Grid);
--- a/Show More
+++ b/Show More