Boosted fermion attempt

Qslash term added
Better macos
2026-01-23 09:54:50 +00:00 · 2024-10-17 18:37:33 +01:00 · 2023-09-14 16:14:03 -04:00 · 2023-09-14 16:12:21 -04:00
23 changed files with 530 additions and 1661 deletions
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -47,4 +47,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
-#include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -697,68 +697,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
+
  // the above should guarantee that the operations are local
-  
-#if 1
-
-  size_t nsite = 1;
-  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
-  
-  size_t tbytes = 4*nsite*sizeof(int);
-  int *table = (int*)malloc(tbytes);
- 
-  thread_for(idx, nsite, {
-      Coordinate from_coor, to_coor;
-      size_t rem = idx;
-      for(int i=0;i<nd;i++){
-	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
-	from_coor[i] = base_i + FromLowerLeft[i];
-	to_coor[i] = base_i + ToLowerLeft[i];
-      }
-      
-      int foidx = Fg->oIndex(from_coor);
-      int fiidx = Fg->iIndex(from_coor);
-      int toidx = Tg->oIndex(to_coor);
-      int tiidx = Tg->iIndex(to_coor);
-      int* tt = table + 4*idx;
-      tt[0] = foidx;
-      tt[1] = fiidx;
-      tt[2] = toidx;
-      tt[3] = tiidx;
-    });
-  
-  int* table_d = (int*)acceleratorAllocDevice(tbytes);
-  acceleratorCopyToDevice(table,table_d,tbytes);
-
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-
-  autoView(from_v,From,AcceleratorRead);
-  autoView(to_v,To,AcceleratorWrite);
-  
-  accelerator_for(idx,nsite,1,{
-      static const int words=sizeof(vobj)/sizeof(vector_type);
-      int* tt = table_d + 4*idx;
-      int from_oidx = *tt++;
-      int from_lane = *tt++;
-      int to_oidx = *tt++;
-      int to_lane = *tt;
-
-      const vector_type* from = (const vector_type *)&from_v[from_oidx];
-      vector_type* to = (vector_type *)&to_v[to_oidx];
-      
-      scalar_type stmp;
-      for(int w=0;w<words;w++){
-	stmp = getlane(from[w], from_lane);
-	putlane(to[w], stmp, to_lane);
-      }
-    });
-  
-  acceleratorFreeDevice(table_d);    
-  free(table);
-  
-
-#else  
  Coordinate ldf = Fg->_ldimensions;
  Coordinate rdf = Fg->_rdimensions;
  Coordinate isf = Fg->_istride;
@@ -798,8 +738,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 #endif
    }
  });
-
-#endif
 }


@@ -892,8 +830,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 }


-//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
-//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@@ -915,65 +851,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
    }
  }

-#if 1
-  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
-  size_t tbytes = 4*nsite*sizeof(int);
-  int *table = (int*)malloc(tbytes);
-  
-  thread_for(idx,nsite,{
-    Coordinate lcoor(nl);
-    Coordinate hcoor(nh);
-    lcoor[orthog] = slice_lo;
-    hcoor[orthog] = slice_hi;
-    size_t rem = idx;
-    for(int mu=0;mu<nl;mu++){
-      if(mu != orthog){
-	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
-	lcoor[mu] = hcoor[mu] = xmu;
-      }
-    }
-    int loidx = lg->oIndex(lcoor);
-    int liidx = lg->iIndex(lcoor);
-    int hoidx = hg->oIndex(hcoor);
-    int hiidx = hg->iIndex(hcoor);
-    int* tt = table + 4*idx;
-    tt[0] = loidx;
-    tt[1] = liidx;
-    tt[2] = hoidx;
-    tt[3] = hiidx;
-    });
-   
-  int* table_d = (int*)acceleratorAllocDevice(tbytes);
-  acceleratorCopyToDevice(table,table_d,tbytes);
-
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-
-  autoView(lowDim_v,lowDim,AcceleratorRead);
-  autoView(higherDim_v,higherDim,AcceleratorWrite);
-  
-  accelerator_for(idx,nsite,1,{
-      static const int words=sizeof(vobj)/sizeof(vector_type);
-      int* tt = table_d + 4*idx;
-      int from_oidx = *tt++;
-      int from_lane = *tt++;
-      int to_oidx = *tt++;
-      int to_lane = *tt;
-
-      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
-      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
-      
-      scalar_type stmp;
-      for(int w=0;w<words;w++){
-	stmp = getlane(from[w], from_lane);
-	putlane(to[w], stmp, to_lane);
-      }
-    });
-  
-  acceleratorFreeDevice(table_d);    
-  free(table);
-  
-#else
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
@@ -989,7 +866,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
-#endif
 }


--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -26,32 +26,14 @@ Author: Peter Boyle pboyle@bnl.gov
 /*  END LEGAL */
 #pragma once

-#include<Grid/cshift/Cshift.h>
-
 NAMESPACE_BEGIN(Grid);

-//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
-template<typename vobj>
-struct CshiftImplBase{
-  virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
-  virtual ~CshiftImplBase(){}
-};
-template<typename vobj>
-struct CshiftImplDefault: public CshiftImplBase<vobj>{
-  Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
-};
-template<typename Gimpl>
-struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
-  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
-};  
-
 class PaddedCell {
 public:
  GridCartesian * unpadded_grid;
  int dims;
  int depth;
  std::vector<GridCartesian *> grids;
-
  ~PaddedCell()
  {
    DeleteGrids();
@@ -95,7 +77,7 @@ public:
    }
  };
  template<class vobj>
-  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
+  inline Lattice<vobj> Extract(Lattice<vobj> &in)
  {
    Lattice<vobj> out(unpadded_grid);

@@ -106,19 +88,19 @@ public:
    return out;
  }
  template<class vobj>
-  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
+  inline Lattice<vobj> Exchange(Lattice<vobj> &in)
  {
    GridBase *old_grid = in.Grid();
    int dims = old_grid->Nd();
    Lattice<vobj> tmp = in;
    for(int d=0;d<dims;d++){
-      tmp = Expand(d,tmp,cshift); // rvalue && assignment
+      tmp = Expand(d,tmp); // rvalue && assignment
    }
    return tmp;
  }
  // expand up one dim at a time
  template<class vobj>
-  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
+  inline Lattice<vobj> Expand(int dim,Lattice<vobj> &in)
  {
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
@@ -130,40 +112,20 @@ public:
    else       conformable(old_grid,grids[dim-1]);

    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
-
-    double tins=0, tshift=0;
-    
    // Middle bit
-    double t = usecond();
    for(int x=0;x<local[dim];x++){
      InsertSliceLocal(in,padded,x,depth+x,dim);
    }
-    tins += usecond() - t;
-    
    // High bit
-    t = usecond();
-    shifted = cshift.Cshift(in,dim,depth);
-    tshift += usecond() - t;
-
-    t=usecond();
+    shifted = Cshift(in,dim,depth);
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
-    tins += usecond() - t;
-    
    // Low bit
-    t = usecond();
-    shifted = cshift.Cshift(in,dim,-depth);
-    tshift += usecond() - t;
-    
-    t = usecond();
+    shifted = Cshift(in,dim,-depth);
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
-    tins += usecond() - t;
-
-    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
-    
    return padded;
  }

--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -124,6 +124,11 @@ public:
  RealD                _b;
  RealD                _c;

+  // possible boost
+  std::vector<ComplexD> qmu;
+  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
+  void addQmu(const FermionField &in, FermionField &out, int dag);
+  
  // Cayley form Moebius (tanh and zolotarev)
  Vector<Coeff_t> omega;
  Vector<Coeff_t> bs;    // S dependent coeffs
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -60,6 +60,50 @@ public:
  //      virtual void   Instantiatable(void)=0;
  virtual void   Instantiatable(void) =0;

+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  { // Free-field propagator: forward FFT of the source, MomentumSpacePropagatorHw, backward FFT, then multiply by exp(i*ph).
+    std::cout << "Free Propagator for ContinuedFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    // boundary and twist are taken by value; twist is modified locally in the loop below.
+    FFT theFFT((GridCartesian *) in.Grid());
+
+    //phase for boundary condition
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// NOTE(review): shift is 0, so index nu is used unshifted; the inherited remark about shifting by 1 for a 5th dimension does not match -- confirm.
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu])); // only the real part of the boundary factor enters
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+    // NOTE(review): in_buf is not read again; the forward FFT below takes `in` -- confirm which source is intended.
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    
+    //phase for boundary condition
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    // Convenience overload: zero twist and a boundary factor of 1 (periodic) in each of the Nd directions.
+    const std::vector<double>  no_twist(Nd,0.0);
+    const std::vector<Complex> periodic(Nd,Complex(1.0));
+    FreePropagator(in,out,mass,periodic,no_twist);
+  };
+
+  
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
 public:
  INHERIT_IMPL_TYPES(Impl);

-  const int part_frac_chroma_convention=1;
+  const int part_frac_chroma_convention=0;

  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
@@ -83,12 +83,63 @@ public:
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());

+  PartialFractionFermion5D(GaugeField &_Umu,
+			   GridCartesian         &FiveDimGrid,
+			   GridRedBlackCartesian &FiveDimRedBlackGrid,
+			   GridCartesian         &FourDimGrid,
+			   GridRedBlackCartesian &FourDimRedBlackGrid,
+			   RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
+
+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  { // Free-field propagator: forward FFT of the source, MomentumSpacePropagatorHw, backward FFT, then multiply by exp(i*ph).
+    std::cout << "Free Propagator for PartialFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    // boundary and twist are taken by value; twist is modified locally in the loop below.
+    FFT theFFT((GridCartesian *) in.Grid());
+
+    //phase for boundary condition
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// NOTE(review): shift is 0, so index nu is used unshifted; the inherited remark about shifting by 1 for a 5th dimension does not match -- confirm.
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu])); // only the real part of the boundary factor enters
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+    // NOTE(review): in_buf is not read again; the forward FFT below takes `in` -- confirm which source is intended.
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    
+    //phase for boundary condition
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    // Convenience overload: zero twist and a boundary factor of 1 (periodic) in each of the Nd directions.
+    const std::vector<double>  no_twist(Nd,0.0);
+    const std::vector<Complex> periodic(Nd,Complex(1.0));
+    FreePropagator(in,out,mass,periodic,no_twist);
+  };
+  
 protected:

  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);

  // Part frac
+  std::vector<RealD> qmu;
  RealD mass;
  RealD dw_diag;
  RealD R;
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -48,7 +48,8 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
  mass_plus(_mass), mass_minus(_mass)
-{ 
+{
+  // qmu defaults to zero size;
 }

 ///////////////////////////////////////////////////////////////
@@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  M5Ddag(psi,psi,Din,lower,diag,upper);
 }

+template<class Impl>
+void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
+{
+  // Accumulates sum_mu (i q_mu) gamma_mu psi into chi; when dag is set, each
+  // coefficient i q_mu is complex conjugated before use.
+  // An empty qmu leaves chi untouched.
+  if ( qmu.size() == 0 ) return;
+
+  assert(qmu.size()==Nd);
+
+  const Gamma::Algebra gamma_dirs [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+  const ComplexD imag_unit(0,1);
+
+  for(int dir=0;dir<Nd;dir++){
+    // i q_mu for this direction, conjugated for the daggered operator
+    ComplexD weight = imag_unit*qmu[dir];
+    if ( dag ) weight = conjugate(weight);
+
+    // terms are added in increasing direction order, one per pass
+    chi = chi + Gamma(gamma_dirs[dir])*psi*weight;
+  }
+}
+
 template<class Impl>
 void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
@@ -277,8 +306,12 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  
  // Assemble Din
  Meooe5D(psi,Din);
-  
+
  this->DW(Din,chi,DaggerNo);
+
+  // add i q_mu gamma_mu here
+  addQmu(Din,chi,DaggerNo);
+  
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
  
@@ -295,6 +328,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  FermionField Din(psi.Grid());
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
+
+  // add -i conj(q_mu) gamma_mu here ... if qmu is real, gamma_5 hermitian, otherwise not.
+  addQmu(psi,Din,DaggerYes);
  
  MeooeDag5D(Din,chi);
  
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -42,13 +42,13 @@ template<class Impl>
 void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
 {
  // How to check Ls matches??
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
  int Ls = this->Ls;
+  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
  assert(zdata->db==Ls);// Beta has Ls coeffs

  R=(1+this->mass)/(1-this->mass);
@@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -255,15 +255,76 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  }
 	
  {
+    // The 'conventional' Cayley overlap operator is
+    //
+    // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
+    //
+    //
+    // With massless limit 1/2(1+g5 sgnHw)
+    //
+    // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
+    //
+    // However, the conventional normalisation has both a leading order factor of 2 in Zq
+    // at tree level AND a mass dependent (1-m) that are convenient to absorb.
+    //
+    // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
+    //
+    // num = -i sin kmu gmu
+    //
+    // denom ( sqrt(sk^2 + (2shk^2 - 1)^2
+    //    b_k = sk2 - M5;
+    //     
+    //    w_k = sqrt(sk + b_k*b_k);
+    //
+    //    denom= ( w_k + b_k + mass*mass) ;
+    //
+    //    denom= one/denom;
+    //    out = num*denom;
+    //
+    // Chroma, and Grid define partial fraction via 4d operator
+    //
+    //   Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
+    //
+    // Now since:
+    //
+    //      (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
+    //
+    // This corresponds to a modified mass parameter
+    //
+    // It has an annoying ... (TODO: this remark was left unfinished; complete or remove it)
+    //
+    // 
    double R=(1+this->mass)/(1-this->mass);
    //R g5 psi[Ls] + p[0] H
    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-	
+    
    for(int b=0;b<nblock;b++){
      int s = 2*b+1;
      double pp = p[nblock-1-b];
      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
    }
+
+    if ( qmu.size() ) {
+      // Optional boost term: adds i g5 qslash psi, with qslash = sum_mu q_mu gamma_mu, to the s=Ls-1 slice of chi.
+      FermionField qslash_psi(psi.Grid());
+      
+      Gamma::Algebra Gmu [] = {
+			 Gamma::Algebra::GammaX,
+			 Gamma::Algebra::GammaY,
+			 Gamma::Algebra::GammaZ,
+			 Gamma::Algebra::GammaT
+      };
+      ComplexD ci(0,1);
+      assert(qmu.size()==Nd);
+      qslash_psi = Gamma(Gmu[0])*psi*qmu[0];
+      for(int mu=1;mu<Nd;mu++){
+	qslash_psi = qslash_psi + Gamma(Gmu[mu])*psi*qmu[mu]; // accumulate each direction, weighted by q_mu
+      }
+      // NOTE(review): overall normalisation of this term (possible 1-m factor) is still open -- confirm.
+      qslash_psi = Gamma(Gamma::Algebra::Gamma5)*qslash_psi*ci ; // i g5 qslash -- 1-m factor???
+      axpby_ssp(chi,1.0,chi,1.0, qslash_psi,Ls-1,Ls-1);
+    }
+    
  }

 }
@@ -411,7 +472,7 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@@ -421,7 +482,8 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      std::cout << " importing to slice " << Ls-1 <<std::endl;
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
@@ -442,7 +504,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

 {
  int Ls = this->Ls;
-
+  qmu.resize(0);
  assert((Ls&0x1)==1); // Odd Ls required
  int nrational=Ls-1;

@@ -460,6 +522,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
  Approx::zolotarev_free(zdata);

 }
+// Boosted variant: constructs the operator through the delegated constructor,
+// then stores the caller's vector _qmu in this->qmu.
+template<class Impl>
+PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+							 GridCartesian         &FiveDimGrid,
+							 GridRedBlackCartesian &FiveDimRedBlackGrid,
+							 GridCartesian         &FourDimGrid,
+							 GridRedBlackCartesian &FourDimRedBlackGrid,
+							 RealD _mass,RealD M5,std::vector<RealD> &_qmu,
+							 const ImplParams &p)
+  : PartialFractionFermion5D<Impl>(_Umu,FiveDimGrid,FiveDimRedBlackGrid,
+				   FourDimGrid,FourDimRedBlackGrid,_mass,M5,p)
+{
+  // copy the requested boost into the member; its size is asserted where it is used
+  this->qmu = _qmu;
+}

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -176,7 +176,7 @@ public:
      return PeriodicBC::CshiftLink(Link,mu,shift);
  }

-  static inline void       setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
+  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
 };
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@@ -43,7 +43,7 @@ public:
 private:
  RealD c_plaq;
  RealD c_rect;
-  typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
+
 public:
  PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};

@@ -79,18 +79,27 @@ public:
    GridBase *grid = Umu.Grid();

    std::vector<GaugeLinkField> U (Nd,grid);
+    std::vector<GaugeLinkField> U2(Nd,grid);
+
    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+      WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
    }
-    std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
-    WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);

    GaugeLinkField dSdU_mu(grid);
    GaugeLinkField staple(grid);

    for (int mu=0; mu < Nd; mu++){
-      dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
-      dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
+
+      // Staple in direction mu
+
+      WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
+
+      dSdU_mu = Ta(U[mu]*staple)*factor_p;
+
+      WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
+
+      dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
 	  
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -37,14 +37,13 @@ NAMESPACE_BEGIN(Grid);
 // Make these members of an Impl class for BC's.

 namespace PeriodicBC { 
-  //Out(x) = Link(x)*field(x+mu)
+
  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
 									   int mu,
 									   const Lattice<covariant> &field)
  {
    return Link*Cshift(field,mu,1);// moves towards negative mu
  }
-  //Out(x) = Link^dag(x-mu)*field(x-mu)
  template<class covariant,class gauge> Lattice<covariant> CovShiftBackward(const Lattice<gauge> &Link, 
 									    int mu,
 									    const Lattice<covariant> &field)
@@ -53,19 +52,19 @@ namespace PeriodicBC {
    tmp = adj(Link)*field;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
-  //Out(x) = Link^dag(x-mu)
+
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) 
  {
    return Cshift(adj(Link), mu, -1);
  }
-  //Out(x) = Link(x)
+
  template<class gauge> Lattice<gauge>
  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
  {
    return Link;
  }
-  //Link(x) = Link(x+mu)
+
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -290,7 +290,7 @@ public:
  }
 */
  //////////////////////////////////////////////////
-  // the sum over all nu-oriented staples for nu != mu on each site
+  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

@@ -300,10 +300,6 @@ public:
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
-    Staple(staple, U, mu);
-  }
-
-  static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
    staple = Zero();

    for (int nu = 0; nu < Nd; nu++) {
@@ -339,202 +335,6 @@ public:
    }
  }

-  /////////////
-  //Staples for each direction mu, summed over nu != mu
-  //staple: output staples for each mu (Nd)
-  //U: link array (Nd)
-  /////////////
-  static void StapleAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U) {
-    assert(staple.size() == Nd); assert(U.size() == Nd);
-    for(int mu=0;mu<Nd;mu++) Staple(staple[mu], U, mu);
-  }
-
-
-  //A workspace class allowing reuse of the stencil
-  class WilsonLoopPaddedStencilWorkspace{
-    std::unique_ptr<GeneralLocalStencil> stencil;
-    size_t nshift;
-
-    void generateStencil(GridBase* padded_grid){
-      double t0 = usecond();
-      
-      //Generate shift arrays
-      std::vector<Coordinate> shifts = this->getShifts();
-      nshift = shifts.size();
-      
-      double t1 = usecond();
-      //Generate local stencil
-      stencil.reset(new GeneralLocalStencil(padded_grid,shifts));
-      double t2 = usecond();
-      std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl;   
-    }
-  public:
-    //Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly
-    const GeneralLocalStencil & getStencil(const PaddedCell &pcell){
-      assert(pcell.depth >= this->paddingDepth());
-      if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back());
-      return *stencil;
-    }
-    size_t Nshift() const{ return nshift; }
-    
-    virtual std::vector<Coordinate> getShifts() const = 0;
-    virtual int paddingDepth() const = 0; //padding depth required
-    
-    virtual ~WilsonLoopPaddedStencilWorkspace(){}
-  };
-
-  //This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces
-  class WilsonLoopPaddedWorkspace{
-    std::vector<WilsonLoopPaddedStencilWorkspace*> stencil_wk;
-    std::unique_ptr<PaddedCell> pcell;
-
-    void generatePcell(GridBase* unpadded_grid){
-      assert(stencil_wk.size());
-      int max_depth = 0;
-      for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth());
-      
-      pcell.reset(new PaddedCell(max_depth, dynamic_cast<GridCartesian*>(unpadded_grid)));
-    }
-    
-  public:
-    //Add a stencil definition. This should be done before the first call to retrieve a stencil object.
-    //Takes ownership of the pointer
-    void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){
-      assert(!pcell);
-      stencil_wk.push_back(stencil);
-    }
-
-    const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){
-      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
-      return stencil_wk[stencil_idx]->getStencil(*pcell);
-    }      
-    const PaddedCell & getPaddedCell(GridBase* unpadded_grid){
-      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
-      return *pcell;
-    }
-    
-    ~WilsonLoopPaddedWorkspace(){
-      for(auto &s : stencil_wk) delete s;
-    }
-  };
-
-  //A workspace class allowing reuse of the stencil
-  class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
-  public:
-    std::vector<Coordinate> getShifts() const override{
-      std::vector<Coordinate> shifts;
-      for(int mu=0;mu<Nd;mu++){
-	for(int nu=0;nu<Nd;nu++){
-	  if(nu != mu){
-	    Coordinate shift_0(Nd,0);
-	    Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
-	    Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
-	    Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
-	    Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
-      
-	    //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
-	    shifts.push_back(shift_0);
-	    shifts.push_back(shift_nu);
-	    shifts.push_back(shift_mu);
-      
-	    //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
-	    shifts.push_back(shift_mnu);
-	    shifts.push_back(shift_mnu);
-	    shifts.push_back(shift_mnu_pmu);
-	  }
-	}
-      }
-      return shifts;
-    }
-
-    int paddingDepth() const override{ return 1; }
-  }; 
-
-  //Padded cell implementation of the staple method for all mu, summed over nu != mu
-  //staple: output staple for each mu, summed over nu != mu (Nd)
-  //U_padded: the gauge link fields padded out using the PaddedCell class
-  //Cell: the padded cell class
-  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
-    StaplePaddedAllWorkspace wk;
-    StaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
-  }
-  
-  //Padded cell implementation of the staple method for all mu, summed over nu != mu
-  //staple: output staple for each mu, summed over nu != mu (Nd)
-  //U_padded: the gauge link fields padded out using the PaddedCell class
-  //Cell: the padded cell class
-  //gStencil: the precomputed generalized local stencil for the staple
-  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
-    double t0 = usecond();
-    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
-    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
-    assert(Cell.depth >= 1);
-    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
-
-    int shift_mu_off = gStencil._npoints/Nd;
-    
-    //Open views to padded gauge links and keep open over mu loop
-    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
-    size_t vsize = Nd*sizeof(GaugeViewType);
-    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
-    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
-    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
-    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
-    
-    GaugeMat gStaple(ggrid);
-
-    int outer_off = 0;
-    for(int mu=0;mu<Nd;mu++){
-      { //view scope
-	autoView( gStaple_v , gStaple, AcceleratorWrite);
-	auto gStencil_v = gStencil.View();
-	
-	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
-	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
-	    stencil_ss = Zero();
-	    int off = outer_off;
-	    
-	    for(int nu=0;nu<Nd;nu++){
-	      if(nu != mu){	  
-		GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
-		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(off++,ss);
-		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(off++,ss);
-		auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-      
-		stencil_ss = stencil_ss + U2 * U1 * U0;
-
-		e = gStencil_v.GetEntry(off++,ss);
-		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(off++,ss);
-		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(off++,ss);
-		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-
-		stencil_ss = stencil_ss + U2 * U1 * U0;
-	      }
-	    }
-		
-	    coalescedWrite(gStaple_v[ss],stencil_ss);
-	  }
-	  );
-      } //ensure views are all closed!
-      
-      staple[mu] = Cell.Extract(gStaple);
-      outer_off += shift_mu_off;
-    }//mu loop
-
-    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
-    free(Ug_dirs_v_host);
-    acceleratorFreeDevice(Ug_dirs_v);
-    
-    double t1=usecond();
-    
-    std::cout << GridLogPerformance << "StaplePaddedAll timing:" << (t1-t0)/1000 << "ms" << std::endl;   
-  }
-
-   
  //////////////////////////////////////////////////
  // the sum over all staples on each site in direction mu,nu, upper part
  //////////////////////////////////////////////////
@@ -907,14 +707,18 @@ public:
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
-    U2 = U * Gimpl::CshiftLink(U, mu, 1);
+    U2 = U * Cshift(U, mu, 1);
  }

  ////////////////////////////////////////////////////////////////////////////
-  // Hop by two optimisation strategy. Use RectStapleDouble to obtain 'U2'
+  // The hop-by-two optimisation strategy does not work nicely with Gparity.
+  // (It could, but we would need to track, two links deep, where a path
+  // crosses the boundary and apply a conjugation there.)
+  // We must therefore differentiate this in Gimpl, and use
+  // Gimpl::isPeriodicGaugeField to do so.
  ////////////////////////////////////////////////////////////////////////////
-  static void RectStapleOptimised(GaugeMat &Stap, const std::vector<GaugeMat> &U2,
-                                  const std::vector<GaugeMat> &U, int mu) {
+  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
+                                  std::vector<GaugeMat> &U, int mu) {

    Stap = Zero();

@@ -928,9 +732,9 @@ public:

        // Up staple    ___ ___
        //             |       |
-        tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
+        tmp = Cshift(adj(U[nu]), nu, -1);
        tmp = adj(U2[mu]) * tmp;
-        tmp = Gimpl::CshiftLink(tmp, mu, -2);
+        tmp = Cshift(tmp, mu, -2);

        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);

@@ -938,14 +742,14 @@ public:
        //             |___ ___|
        //
        tmp = adj(U2[mu]) * U[nu];
-        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));
+        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));

        //              ___ ___
        //             |    ___|
        //             |___ ___|
        //

-        Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
+        Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);

        //              ___ ___
        //             |___    |
@@ -954,7 +758,7 @@ public:

        //  tmp= Staple2x1* Cshift(U[mu],mu,-2);
        //  Stap+= Cshift(tmp,mu,1) ;
-        Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
+        Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);
        ;

        //       --
@@ -962,10 +766,10 @@ public:
        //
        //      |  |

-        tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
+        tmp = Cshift(adj(U2[nu]), nu, -2);
        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
-        tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
-        Stap += Gimpl::CshiftLink(tmp, mu, 1);
+        tmp = U2[nu] * Cshift(tmp, nu, 2);
+        Stap += Cshift(tmp, mu, 1);

        //      |  |
        //
@@ -974,12 +778,25 @@ public:

        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
        tmp = adj(U2[nu]) * tmp;
-        tmp = Gimpl::CshiftLink(tmp, nu, -2);
-        Stap += Gimpl::CshiftLink(tmp, mu, 1);
+        tmp = Cshift(tmp, nu, -2);
+        Stap += Cshift(tmp, mu, 1);
      }
    }
  }

+  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
+    RectStapleUnoptimised(Stap, Umu, mu);
+  }
+  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
+                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
+                         int mu) {
+    if (Gimpl::isPeriodicGaugeField()) {
+      RectStapleOptimised(Stap, U2, U, mu);
+    } else {
+      RectStapleUnoptimised(Stap, Umu, mu);
+    }
+  }
+
  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
                                    int mu) {
    GridBase *grid = Umu.Grid();
@@ -1078,288 +895,6 @@ public:
    }
  }

-  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
-    RectStapleUnoptimised(Stap, Umu, mu);
-  }
-  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
-                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
-                         int mu) {
-    RectStapleOptimised(Stap, U2, U, mu);
-  }
-  //////////////////////////////////////////////////////
-  //Compute the rectangular staples for all orientations
-  //Stap : Array of staples (Nd)
-  //U: Gauge links in each direction (Nd)
-  /////////////////////////////////////////////////////
-  static void RectStapleAll(std::vector<GaugeMat> &Stap, const std::vector<GaugeMat> &U){
-    assert(Stap.size() == Nd); assert(U.size() == Nd);
-    std::vector<GaugeMat> U2(Nd,U[0].Grid());
-    for(int mu=0;mu<Nd;mu++) RectStapleDouble(U2[mu], U[mu], mu);
-    for(int mu=0;mu<Nd;mu++) RectStapleOptimised(Stap[mu], U2, U, mu);
-  }
-
-  //A workspace class allowing reuse of the stencil
-  class RectStaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
-  public:
-    std::vector<Coordinate> getShifts() const override{
-      std::vector<Coordinate> shifts;
-      for (int mu = 0; mu < Nd; mu++){
-	for (int nu = 0; nu < Nd; nu++) {
-	  if (nu != mu) {
-	    auto genShift = [&](int mushift,int nushift){
-	      Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
-	    };
-
-	    //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
-	    shifts.push_back(genShift(0,0));
-	    shifts.push_back(genShift(0,+1));
-	    shifts.push_back(genShift(+1,+1));
-	    shifts.push_back(genShift(+2,0));
-	    shifts.push_back(genShift(+1,0));
-
-	    //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
-	    shifts.push_back(genShift(0,-1));
-	    shifts.push_back(genShift(0,-1));
-	    shifts.push_back(genShift(+1,-1));
-	    shifts.push_back(genShift(+2,-1));
-	    shifts.push_back(genShift(+1,0));
-
-	    //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
-	    shifts.push_back(genShift(-1,0));
-	    shifts.push_back(genShift(-1,-1));
-	    shifts.push_back(genShift(-1,-1));
-	    shifts.push_back(genShift(0,-1));
-	    shifts.push_back(genShift(+1,-1));
-
-	    //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
-	    shifts.push_back(genShift(-1,0));
-	    shifts.push_back(genShift(-1,0));
-	    shifts.push_back(genShift(-1,+1));
-	    shifts.push_back(genShift(0,+1));
-	    shifts.push_back(genShift(+1,0));
-
-	    //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
-	    shifts.push_back(genShift(0,0));
-	    shifts.push_back(genShift(0,+1));
-	    shifts.push_back(genShift(0,+2));
-	    shifts.push_back(genShift(+1,+1));
-	    shifts.push_back(genShift(+1,0));
-
-	    //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
-	    shifts.push_back(genShift(0,-1));
-	    shifts.push_back(genShift(0,-2));
-	    shifts.push_back(genShift(0,-2));
-	    shifts.push_back(genShift(+1,-2));
-	    shifts.push_back(genShift(+1,-1));
-	  }
-	}
-      }
-      return shifts;
-    }
-
-    int paddingDepth() const override{ return 2; }
-  }; 
-
-  //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
-  //staple: output staple for each mu, summed over nu != mu (Nd)
-  //U_padded: the gauge link fields padded out using the PaddedCell class
-  //Cell: the padded cell class
-  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
-    RectStaplePaddedAllWorkspace wk;
-    RectStaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
-  }
-  
-  //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
-  //staple: output staple for each mu, summed over nu != mu (Nd)
-  //U_padded: the gauge link fields padded out using the PaddedCell class
-  //Cell: the padded cell class
-  //gStencil: the stencil
-  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
-    double t0 = usecond();
-    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
-    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
-    assert(Cell.depth >= 2);
-    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
-
-    size_t nshift = gStencil._npoints;
-    int mu_off_delta = nshift / Nd;
-    
-    //Open views to padded gauge links and keep open over mu loop
-    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
-    size_t vsize = Nd*sizeof(GaugeViewType);
-    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
-    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
-    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
-    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
-
-    GaugeMat gStaple(ggrid); //temp staple object on padded grid
-
-    int offset = 0;
-    for(int mu=0; mu<Nd; mu++){
-
-      { //view scope
-	autoView( gStaple_v , gStaple, AcceleratorWrite);
-	auto gStencil_v = gStencil.View();
-
-	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
-	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
-	    stencil_ss = Zero();
-	    int s=offset;
-	    for(int nu=0;nu<Nd;nu++){
-	      if(nu != mu){
-		//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
-		GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
-		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(s++,ss);
-		auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
-	    
-		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
-
-		//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
-		e = gStencil_v.GetEntry(s++,ss);
-		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(s++,ss);
-		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
-
-		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
-
-		//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
-		e = gStencil_v.GetEntry(s++,ss);
-		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(s++,ss);
-		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(s++,ss);
-		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-
-		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
-
-		//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
-		e = gStencil_v.GetEntry(s++,ss);
-		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(s++,ss);
-		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-
-		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
-
-		//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
-		e = gStencil_v.GetEntry(s++,ss);
-		U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(s++,ss);
-		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-
-		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
-
-		//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
-		e = gStencil_v.GetEntry(s++,ss);
-		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(s++,ss);
-		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(s++,ss);
-		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(s++,ss);
-		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-
-		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
-
-	      }
-	    }
-	    coalescedWrite(gStaple_v[ss],stencil_ss);
-	  }
-	  );
-	offset += mu_off_delta;
-      }//kernel/view scope
-
-      staple[mu] = Cell.Extract(gStaple);    
-    }//mu loop
-  
-    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
-    free(Ug_dirs_v_host);
-    acceleratorFreeDevice(Ug_dirs_v);
-    
-    double t1 = usecond();
-    
-    std::cout << GridLogPerformance << "RectStaplePaddedAll timings:" << (t1-t0)/1000 << "ms" << std::endl;   
-  }
-
-  //A workspace for reusing the PaddedCell and GeneralLocalStencil objects
-  class StapleAndRectStapleAllWorkspace: public WilsonLoopPaddedWorkspace{
-  public:
-    StapleAndRectStapleAllWorkspace(){
-      this->addStencil(new StaplePaddedAllWorkspace);
-      this->addStencil(new RectStaplePaddedAllWorkspace);
-    }
-  };     
-    
-  //////////////////////////////////////////////////////
-  //Compute the 1x1 and 1x2 staples for all orientations
-  //Stap : Array of staples (Nd)
-  //RectStap: Array of rectangular staples (Nd)
-  //U: Gauge links in each direction (Nd)
-  /////////////////////////////////////////////////////
-  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U){
-    StapleAndRectStapleAllWorkspace wk;
-    StapleAndRectStapleAll(Stap,RectStap,U,wk);
-  }
-  
-  //////////////////////////////////////////////////////
-  //Compute the 1x1 and 1x2 staples for all orientations
-  //Stap : Array of staples (Nd)
-  //RectStap: Array of rectangular staples (Nd)
-  //U: Gauge links in each direction (Nd)
-  //wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse
-  /////////////////////////////////////////////////////
-  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U, StapleAndRectStapleAllWorkspace &wk){
-#if 0
-    StapleAll(Stap, U);
-    RectStapleAll(RectStap, U);
-#else
-    double t0 = usecond();
-
-    GridCartesian* unpadded_grid = dynamic_cast<GridCartesian*>(U[0].Grid());
-    const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid);
-        
-    CshiftImplGauge<Gimpl> cshift_impl;
-    std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
-    for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
-    double t1 = usecond();
-    StaplePaddedAll(Stap, U_pad, Ghost, wk.getStencil(0,unpadded_grid) );
-    double t2 = usecond();
-    RectStaplePaddedAll(RectStap, U_pad, Ghost, wk.getStencil(1,unpadded_grid));
-    double t3 = usecond();
-    std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
-#endif
-  }
-
  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -79,60 +79,60 @@ public:
    this->_entries.resize(npoints* osites);
    this->_entries_p = &_entries[0];

-    thread_for(site, osites, {
-	Coordinate Coor;
-	Coordinate NbrCoor;

-	for(Integer ii=0;ii<npoints;ii++){
-	  Integer lex = site*npoints+ii;
-	  GeneralStencilEntry SE;
-	  ////////////////////////////////////////////////
-	  // Outer index of neighbour Offset calculation
-	  ////////////////////////////////////////////////
-	  grid->oCoorFromOindex(Coor,site);
-	  for(int d=0;d<Coor.size();d++){
-	    int rd = grid->_rdimensions[d];
-	    NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
-	  }
-	  SE._offset      = grid->oIndexReduced(NbrCoor);
-
-	  ////////////////////////////////////////////////
-	  // Inner index permute calculation
-	  // Simpler version using icoor calculation
-	  ////////////////////////////////////////////////
-	  SE._permute =0;
-	  for(int d=0;d<Coor.size();d++){
-
-	    int fd = grid->_fdimensions[d];
-	    int rd = grid->_rdimensions[d];
-	    int ly = grid->_simd_layout[d];
-
-	    assert((ly==1)||(ly==2));
-
-	    int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
-	    int x = Coor[d];                // x in [0... rd-1] as an oSite 
-
-	    int permute_dim  = grid->PermuteDim(d);
-	    int permute_slice=0;
-	    if(permute_dim){    
-	      int  num = shift%rd; // Slice within dest osite cell of slice zero
-	      int wrap = shift/rd; // Number of osite local volume cells crossed through
-	      // x+num < rd dictates whether we are in same permute state as slice 0
-	      if ( x< rd-num ) permute_slice=wrap;
-	      else             permute_slice=(wrap+1)%ly;
-	    }
-	    if ( permute_slice ) {
-	      int ptype       =grid->PermuteType(d);
-	      uint8_t mask    =0x1<<ptype;
-	      SE._permute    |= mask;
-	    }
-	  }	
-	  ////////////////////////////////////////////////
-	  // Store in look up table
-	  ////////////////////////////////////////////////
-	  this->_entries[lex] = SE;
+    Coordinate Coor;
+    Coordinate NbrCoor;
+    for(Integer site=0;site<osites;site++){
+      for(Integer ii=0;ii<npoints;ii++){
+	Integer lex = site*npoints+ii;
+	GeneralStencilEntry SE;
+	////////////////////////////////////////////////
+	// Outer index of neighbour Offset calculation
+	////////////////////////////////////////////////
+	grid->oCoorFromOindex(Coor,site);
+	for(int d=0;d<Coor.size();d++){
+	  int rd = grid->_rdimensions[d];
+	  NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
 	}
-      });
+	SE._offset      = grid->oIndexReduced(NbrCoor);
+
+	////////////////////////////////////////////////
+	// Inner index permute calculation
+	// Simpler version using icoor calculation
+	////////////////////////////////////////////////
+	SE._permute =0;
+	for(int d=0;d<Coor.size();d++){
+
+	  int fd = grid->_fdimensions[d];
+	  int rd = grid->_rdimensions[d];
+	  int ly = grid->_simd_layout[d];
+
+	  assert((ly==1)||(ly==2));
+
+	  int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
+	  int x = Coor[d];                // x in [0... rd-1] as an oSite 
+
+	  int permute_dim  = grid->PermuteDim(d);
+	  int permute_slice=0;
+	  if(permute_dim){    
+	    int  num = shift%rd; // Slice within dest osite cell of slice zero
+	    int wrap = shift/rd; // Number of osite local volume cells crossed through
+                                  // x+num < rd dictates whether we are in same permute state as slice 0
+	    if ( x< rd-num ) permute_slice=wrap;
+	    else             permute_slice=(wrap+1)%ly;
+	  }
+	  if ( permute_slice ) {
+	    int ptype       =grid->PermuteType(d);
+	    uint8_t mask    =0x1<<ptype;
+	    SE._permute    |= mask;
+	  }
+	}	
+	////////////////////////////////////////////////
+	// Store in look up table
+	////////////////////////////////////////////////
+	this->_entries[lex] = SE;
+      }
+    }      
  }
  
 };
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -32,7 +32,6 @@

 #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate
 #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
-#include <Grid/stencil/GeneralLocalStencil.h>

 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -73,16 +73,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
    return vec;
  }
 }
-//'perm_mask' acts as a bitmask
-template<class vobj> accelerator_inline
-vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
-{
-  auto obj = vec, tmp = vec;
-  for (int d=0;d<nd;d++)
-    if (perm_mask & (0x1 << d)) { permute(obj,tmp,d); tmp=obj;}
-  return obj;
-}
-
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 {
@@ -93,7 +83,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 {
  vstream(vec, extracted);
 }
-#else //==GRID_SIMT
+#else


 //#ifndef GRID_SYCL
@@ -176,14 +166,6 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,
  return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
-typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
-{
-  int plane = lane;
-  for (int d=0;d<nd;d++)
-    plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane;
-  return extractLane(plane,vec);
-}
-template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  insertLane(lane,vec,extracted);
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@@ -1,4 +1,4 @@
 BREW=/opt/local/
-MPICXX=mpicxx CXX=c++-12 ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
+CXX=mpicxx-openmpi-mp ../../configure --enable-simd=GEN --enable-comms=mpi --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug


--- a/tests/debug/Test_iwasaki_action_newstaple.cc
+++ b/tests/debug/Test_iwasaki_action_newstaple.cc
@@ -1,188 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./tests/Test_iwasaki_action_newstaple.cc
-
-    Copyright (C) 2015
-
-Author: Christopher Kelly <ckelly@bnl.gov>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-
-////////////////////////////////////////////////////////////////////////
-// PlaqPlusRectangleActoin
-////////////////////////////////////////////////////////////////////////
-template<class Gimpl>
-class PlaqPlusRectangleActionOrig : public Action<typename Gimpl::GaugeField> {
-public:
-
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-private:
-  RealD c_plaq;
-  RealD c_rect;
-
-public:
-  PlaqPlusRectangleActionOrig(RealD b,RealD c): c_plaq(b),c_rect(c){};
-
-  virtual std::string action_name(){return "PlaqPlusRectangleActionOrig";}
-      
-  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
-      
-  virtual std::string LogParameters(){
-    std::stringstream sstream;
-    sstream << GridLogMessage << "["<<action_name() <<"] c_plaq: " << c_plaq << std::endl;
-    sstream << GridLogMessage << "["<<action_name() <<"] c_rect: " << c_rect << std::endl;
-    return sstream.str();
-  }
-
-
-  virtual RealD S(const GaugeField &U) {
-    RealD vol = U.Grid()->gSites();
-
-    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
-    RealD rect = WilsonLoops<Gimpl>::avgRectangle(U);
-
-    RealD action=c_plaq*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5
-      +c_rect*(1.0 -rect)*(Nd*(Nd-1.0))*vol;
-
-    return action;
-  };
-
-  virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
-    //extend Ta to include Lorentz indexes
-    RealD factor_p = c_plaq/RealD(Nc)*0.5;
-    RealD factor_r = c_rect/RealD(Nc)*0.5;
-
-    GridBase *grid = Umu.Grid();
-
-    std::vector<GaugeLinkField> U (Nd,grid);
-    std::vector<GaugeLinkField> U2(Nd,grid);
-
-    for(int mu=0;mu<Nd;mu++){
-      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-      WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
-    }
-
-    GaugeLinkField dSdU_mu(grid);
-    GaugeLinkField staple(grid);
-
-    for (int mu=0; mu < Nd; mu++){
-
-      // Staple in direction mu
-
-      WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
-
-      dSdU_mu = Ta(U[mu]*staple)*factor_p;
-
-      WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
-
-      dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
-	  
-      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
-    }
-
-  };
-
-};
-
-// Convenience for common physically defined cases.
-//
-// RBC c1 parameterisation is not really RBC but don't have good
-// reference and we are happy to change name if prior use of this plaq coeff
-// parameterisation is made known to us. 
-template<class Gimpl>
-class RBCGaugeActionOrig : public PlaqPlusRectangleActionOrig<Gimpl> {
-public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-  RBCGaugeActionOrig(RealD beta,RealD c1) : PlaqPlusRectangleActionOrig<Gimpl>(beta*(1.0-8.0*c1), beta*c1) {};
-  virtual std::string action_name(){return "RBCGaugeActionOrig";}
-};
-
-template<class Gimpl>
-class IwasakiGaugeActionOrig : public RBCGaugeActionOrig<Gimpl> {
-public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-  IwasakiGaugeActionOrig(RealD beta) : RBCGaugeActionOrig<Gimpl>(beta,-0.331) {};
-  virtual std::string action_name(){return "IwasakiGaugeActionOrig";}
-};
-
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  Coordinate latt_size  = GridDefaultLatt();
-  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
-  Coordinate mpi_layout = GridDefaultMpi();
-  std::cout << " mpi "<<mpi_layout<<std::endl;
-  std::cout << " simd "<<simd_layout<<std::endl;
-  std::cout << " latt "<<latt_size<<std::endl;
-  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
-
-  GridParallelRNG   pRNG(&GRID);
-  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-  LatticeGaugeField U(&GRID);
-
-  SU<Nc>::HotConfiguration(pRNG,U);
-
-  //#define PRD
-#ifdef PRD
-  typedef PeriodicGimplD Gimpl;
-#else
-  typedef ConjugateGimplD Gimpl;
-  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
-  Gimpl::setDirections(conj_dirs);
-#endif
-
-  typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
-  typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;
-
-  GaugeLorentz derivOrig(&GRID), derivNew(&GRID);
-  double beta = 2.13;
-  IwasakiGaugeActionOrig<Gimpl> action_orig(beta);
-  IwasakiGaugeAction<Gimpl> action_new(beta);
-
-  double torig=0, tnew=0;
-  int ntest = 10;
-  for(int i=0;i<ntest;i++){
-    double t0 = usecond();
-    action_orig.deriv(U, derivOrig);
-    double t1 = usecond();
-    action_new.deriv(U, derivNew);
-    double t2 = usecond();
-
-    GaugeLorentz diff = derivOrig - derivNew;
-    double n = norm2(diff);
-    std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
-    assert(n<1e-10);
-
-    std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms,  new: " << (t2-t1)/1000 << "ms" << std::endl;
-    torig += (t1-t0)/1000; tnew += (t2-t1)/1000;
-  }
-  std::cout << GridLogMessage << "Avg timings " << ntest << " iterations: orig:" << torig/ntest << "ms,   new:" << tnew/ntest << "ms" << std::endl;
-  
-  Grid_finalize();
-}
--- a/tests/debug/Test_optimized_staple_gaugebc.cc
+++ b/tests/debug/Test_optimized_staple_gaugebc.cc
@@ -1,94 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./tests/Test_optimized_staple_gaugebc.cc
-
-    Copyright (C) 2015
-
-Author: Christopher Kelly <ckelly@bnl.gov>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-#include <Grid/lattice/PaddedCell.h>
-#include <Grid/stencil/GeneralLocalStencil.h>
-
-using namespace std;
-using namespace Grid;
- 
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  Coordinate latt_size  = GridDefaultLatt();
-  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
-  Coordinate mpi_layout = GridDefaultMpi();
-  std::cout << " mpi "<<mpi_layout<<std::endl;
-  std::cout << " simd "<<simd_layout<<std::endl;
-  std::cout << " latt "<<latt_size<<std::endl;
-  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
-
-  GridParallelRNG   pRNG(&GRID);
-  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-  LatticeGaugeField U(&GRID);
-
-  SU<Nc>::HotConfiguration(pRNG,U);
-
-  //#define PRD
-#ifdef PRD
-  typedef PeriodicGimplD Gimpl;
-#else
-  typedef ConjugateGimplD Gimpl;
-  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
-  Gimpl::setDirections(conj_dirs);
-#endif
-
-  typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
-  typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;
-
-  int count = 0;
-  double torig=0, topt=0;
-     
-  std::vector<GaugeMat> Umu(Nd,&GRID), U2(Nd,&GRID);
-  for(int mu=0;mu<Nd;mu++){
-    Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
-    WilsonLoops<Gimpl>::RectStapleDouble(U2[mu], Umu[mu], mu);
-  }
-
-  std::cout << GridLogMessage << "Checking optimized vs unoptimized RectStaple" << std::endl;
-  for(int mu=0;mu<Nd;mu++){
-    GaugeMat staple_orig(&GRID), staple_opt(&GRID), staple_U2(&GRID);
-    double t0 = usecond();
-    WilsonLoops<Gimpl>::RectStapleUnoptimised(staple_orig,U,mu);
-    double t1 = usecond();
-    WilsonLoops<Gimpl>::RectStapleOptimised(staple_opt, U2, Umu, mu);
-    double t2 = usecond();
-    torig += t1-t0;  topt += t2-t1;
-    ++count;
-    
-    GaugeMat diff = staple_orig - staple_opt;
-    double n = norm2(diff);
-    std::cout << GridLogMessage << mu << " " << n << std::endl;
-    assert(n<1e-10);
-  }
-  std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  optimized: " << topt/1000/count << "ms" << std::endl;
-  
-  Grid_finalize();
-}
--- a/tests/debug/Test_padded_cell_staple.cc
+++ b/tests/debug/Test_padded_cell_staple.cc
@@ -1,580 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./tests/Test_padded_cell_staple.cc
-
-    Copyright (C) 2015
-
-Author: Christopher Kelly <ckelly@bnl.gov>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-#include <Grid/lattice/PaddedCell.h>
-#include <Grid/stencil/GeneralLocalStencil.h>
-
-using namespace std;
-using namespace Grid;
-
-template <class Gimpl> class WilsonLoopsTest : public Gimpl {
-public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-  typedef typename Gimpl::GaugeLinkField GaugeMat;
-  typedef typename Gimpl::GaugeField GaugeLorentz;
-
-
-  //Original implementation
-  static void StapleOrig(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
-			 int nu) {
-
-    GridBase *grid = Umu.Grid();
-
-    std::vector<GaugeMat> U(Nd, grid);
-    for (int d = 0; d < Nd; d++) {
-      U[d] = PeekIndex<LorentzIndex>(Umu, d);
-    }
-    staple = Zero();
-
-    if (nu != mu) {
-
-      // mu
-      // ^
-      // |__>  nu
-
-      //    __
-      //      |
-      //    __|
-      //
-
-      //Forward: Out(x) = Link(x)*field(x+mu)
-      //Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
-      //ShiftStaple: Link(x) = Link(x+mu)
-
-      //tmp1 = U^dag_nu(x-nu)
-      //tmp2 = U^dag_mu(x-mu) tmp1(x-mu) = U^dag_mu(x-mu) U^dag_nu(x-nu-mu)
-      //tmp3 = U_nu(x) tmp2(x+nu) = U_nu(x)U^dag_mu(x-mu+nu) U^dag_nu(x-mu)
-      //tmp4 = tmp(x+mu) = U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
-
-      staple += Gimpl::ShiftStaple(
-				   Gimpl::CovShiftForward(
-							  U[nu], nu,
-							  Gimpl::CovShiftBackward(
-										  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
-				   mu);
-
-      //  __
-      // |
-      // |__
-      //
-      //
-    
-      //tmp1 = U_mu^dag(x-mu) U_nu(x-mu)
-      //tmp2 = U_nu^dag(x-nu) tmp1(x-nu) = U_nu^dag(x-nu) U_mu^dag(x-mu-nu) U_nu(x-mu-nu)
-      //tmp3 = tmp2(x+mu) = U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
-      staple += Gimpl::ShiftStaple(
-				   Gimpl::CovShiftBackward(U[nu], nu,
-							   Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
-				   mu);
-    }
-  }
-
-  static void StaplePadded(GaugeMat &staple, const GaugeLorentz &U, int mu,
-			   int nu) {
-    if(nu==mu){
-      staple = Zero();
-      return;
-    }
-    double peek = 0, construct = 0, exchange = 0, coord = 0, stencil =0, kernel = 0, extract = 0, total = 0;
-    
-    double tstart = usecond();
-    double t=tstart;
-    
-    PaddedCell Ghost(1, (GridCartesian*)U.Grid());
-
-    construct += usecond() - t;
-      
-    t=usecond();      
-    GaugeMat U_mu = PeekIndex<LorentzIndex>(U, mu);
-    GaugeMat U_nu = PeekIndex<LorentzIndex>(U, nu);
-    peek += usecond() - t;
-
-    t=usecond();
-    CshiftImplGauge<Gimpl> cshift_impl;
-    GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl);
-    GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl);
-    exchange += usecond() - t;
-    
-    GridBase *ggrid = Ug_mu.Grid();
-
-    GaugeMat gStaple(ggrid);
-
-    t=usecond();
-    Coordinate shift_0(Nd,0);
-    Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
-    Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
-    Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
-    Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
-
-    std::vector<Coordinate> shifts;
-
-    //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
-    shifts.push_back(shift_0);
-    shifts.push_back(shift_nu);
-    shifts.push_back(shift_mu);
-
-    //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
-    shifts.push_back(shift_mnu);
-    shifts.push_back(shift_mnu);
-    shifts.push_back(shift_mnu_pmu);
-    coord += usecond()-t;
-
-    t=usecond();
-    GeneralLocalStencil gStencil(ggrid,shifts);
-    stencil += usecond() -t;
-
-    t=usecond();
-    {
-      autoView( gStaple_v , gStaple, AcceleratorWrite);
-      auto gStencil_v = gStencil.View();
-      autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
-      autoView( Ug_nu_v , Ug_nu, AcceleratorRead);
-  
-      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
-	  GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss);
-	  auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
-	  e = gStencil_v.GetEntry(1,ss);
-	  auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
-	  e = gStencil_v.GetEntry(2,ss);
-	  auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
-      
-	  auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x;
-
-	  e = gStencil_v.GetEntry(3,ss);
-	  auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
-	  e = gStencil_v.GetEntry(4,ss);
-	  auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
-	  e = gStencil_v.GetEntry(5,ss);
-	  auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
-
-	  stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu;
-      
-	  coalescedWrite(gStaple_v[ss],stencil_ss);
-	}
-	);
-    } //ensure views are all closed!
-    kernel += usecond() - t;
-
-    t=usecond();
-    staple = Ghost.Extract(gStaple);
-    extract += usecond()-t;
-    
-    total += usecond() - tstart;
-    std::cout << GridLogMessage << "StaplePadded timings peek:" << peek << " construct:" << construct << " exchange:" << exchange << " coord:" << coord << " stencil:" << stencil << " kernel:" << kernel << " extract:" << extract << " total:" << total << std::endl;
-  }
-
-  static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu,
-			     int mu) {
-    GridBase *grid = Umu.Grid();
-
-    std::vector<GaugeMat> U(Nd, grid);
-    for (int d = 0; d < Nd; d++) {
-      U[d] = PeekIndex<LorentzIndex>(Umu, d);
-    }
-
-    Stap = Zero();
-
-    for (int nu = 0; nu < Nd; nu++) {
-      if (nu != mu) {
-        //           __ ___
-        //          |    __ |
-        //
-	//tmp1 = U_nu^dag(x-nu)
-	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu) U_nu^dag(x-nu-mu)
-	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu) U_mu^dag(x-2mu) U_nu^dag(x-nu-2mu)
-	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu) U_mu^dag(x-2mu+nu) U_nu^dag(x-2mu)
-	//tmp5 = U_mu(x)tmp4(x+mu) = U_mu(x)U_nu(x+mu)U_mu^dag(x+nu) U_mu^dag(x-mu+nu) U_nu^dag(x-mu)
-	//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
-	
-        Stap += Gimpl::ShiftStaple(
-				   Gimpl::CovShiftForward(
-							  U[mu], mu,
-							  Gimpl::CovShiftForward(
-										 U[nu], nu,
-										 Gimpl::CovShiftBackward(
-													 U[mu], mu,
-													 Gimpl::CovShiftBackward(
-																 U[mu], mu,
-																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
-				   mu);
-
-        //              __
-        //          |__ __ |
-
-	//tmp1 = U^dag_mu(x-mu)U_nu(x-mu)
-	//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)
-	//tmp3 = U^dag_nu(x-nu)tmp2(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)
-	//tmp4 = U_mu(x)tmp3(x+mu) = U_mu(x)U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)
-	//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
-	
-        Stap += Gimpl::ShiftStaple(
-				   Gimpl::CovShiftForward(
-							  U[mu], mu,
-							  Gimpl::CovShiftBackward(
-										  U[nu], nu,
-										  Gimpl::CovShiftBackward(
-													  U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
-				   mu);
-
-        //           __
-        //          |__ __ |
-	//Forward: Out(x) = Link(x)*field(x+mu)
-	//Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
-	//ShiftStaple: Link(x) = Link(x+mu)
-
-	//tmp1 = U_nu(x)U_mu(x+nu)
-	//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U_nu(x-mu)U_mu(x+nu-mu)
-	//tmp3 = U^dag_mu(x-mu)tmp2(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)U_mu(x+nu-2mu)
-	//tmp4 = U^dag_nu(x-nu)tmp3(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)U_mu(x-2mu)
-	//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
-        Stap += Gimpl::ShiftStaple(
-				   Gimpl::CovShiftBackward(
-							   U[nu], nu,
-							   Gimpl::CovShiftBackward(
-										   U[mu], mu,
-										   Gimpl::CovShiftBackward(
-													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
-				   mu);
-
-        //           __ ___
-        //          |__    |
-	//tmp1 = U_nu^dag(x-nu)U_mu(x-nu)
-	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_mu(x-mu-nu)
-	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_mu^dag(x-2mu)U_nu^dag(x-2mu-nu)U_mu(x-2mu-nu)
-	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_mu^dag(x-2mu+nu)U_nu^dag(x-2mu)U_mu(x-2mu)
-	//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
-        Stap += Gimpl::ShiftStaple(
-				   Gimpl::CovShiftForward(
-							  U[nu], nu,
-							  Gimpl::CovShiftBackward(
-										  U[mu], mu,
-										  Gimpl::CovShiftBackward(
-													  U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
-				   mu);
-
-        //       --
-        //      |  |
-        //
-        //      |  |
-	//tmp1 = U_nu^dag(x-nu)
-	//tmp2 = U_nu^dag(x-nu)tmp1(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)
-	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_nu^dag(x-mu-2nu)
-	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_nu^dag(x-mu-nu)
-	//tmp5 = U_nu(x)tmp4(x+nu) = U_nu(x)U_nu(x+nu)U_mu^dag(x-mu+2nu)U_nu^dag(x-mu+nu)U_nu^dag(x-mu)
-	//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
-        Stap += Gimpl::ShiftStaple(
-				   Gimpl::CovShiftForward(
-							  U[nu], nu,
-							  Gimpl::CovShiftForward(
-										 U[nu], nu,
-										 Gimpl::CovShiftBackward(
-													 U[mu], mu,
-													 Gimpl::CovShiftBackward(
-																 U[nu], nu,
-																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
-				   mu);
-
-        //      |  |
-        //
-        //      |  |
-        //       --
-	//tmp1 = U_nu(x)U_nu(x+nu)
-	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu(x-mu)U_nu(x-mu+nu)
-	//tmp3 = U_nu^dag(x-nu)tmp2(x-nu) = U_nu^dag(x-nu)U_mu^dag(x-mu-nu)U_nu(x-mu-nu)U_nu(x-mu)
-	//tmp4 = U_nu^dag(x-nu)tmp3(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)U_mu^dag(x-mu-2nu)U_nu(x-mu-2nu)U_nu(x-mu-nu)
-	//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
-        Stap += Gimpl::ShiftStaple(
-				   Gimpl::CovShiftBackward(
-							   U[nu], nu,
-							   Gimpl::CovShiftBackward(
-										   U[nu], nu,
-										   Gimpl::CovShiftBackward(
-													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
-				   mu);
-      }
-    }
-  }
-
-
-  static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U,
-			       int mu) {
-    PaddedCell Ghost(2,(GridCartesian*)U.Grid());
-    GridBase *ggrid = Ghost.grids.back();
-    
-    CshiftImplGauge<Gimpl> cshift_impl;
-    std::vector<GaugeMat> Ug_dirs(Nd,ggrid);
-    for(int i=0;i<Nd;i++) Ug_dirs[i] = Ghost.Exchange(PeekIndex<LorentzIndex>(U, i), cshift_impl);
-
-    GaugeMat gStaple(ggrid);
-
-    std::vector<Coordinate> shifts;
-    for (int nu = 0; nu < Nd; nu++) {
-      if (nu != mu) {
-	auto genShift = [&](int mushift,int nushift){
-	  Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
-	};
-
-	//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
-	shifts.push_back(genShift(0,0));
-	shifts.push_back(genShift(0,+1));
-	shifts.push_back(genShift(+1,+1));
-	shifts.push_back(genShift(+2,0));
-	shifts.push_back(genShift(+1,0));
-
-	//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
-	shifts.push_back(genShift(0,-1));
-	shifts.push_back(genShift(0,-1));
-	shifts.push_back(genShift(+1,-1));
-	shifts.push_back(genShift(+2,-1));
-	shifts.push_back(genShift(+1,0));
-
-	//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
-	shifts.push_back(genShift(-1,0));
-	shifts.push_back(genShift(-1,-1));
-	shifts.push_back(genShift(-1,-1));
-	shifts.push_back(genShift(0,-1));
-	shifts.push_back(genShift(+1,-1));
-
-	//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
-	shifts.push_back(genShift(-1,0));
-	shifts.push_back(genShift(-1,0));
-	shifts.push_back(genShift(-1,+1));
-	shifts.push_back(genShift(0,+1));
-	shifts.push_back(genShift(+1,0));
-
-	//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
-	shifts.push_back(genShift(0,0));
-	shifts.push_back(genShift(0,+1));
-	shifts.push_back(genShift(0,+2));
-	shifts.push_back(genShift(+1,+1));
-	shifts.push_back(genShift(+1,0));
-
-	//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
-	shifts.push_back(genShift(0,-1));
-	shifts.push_back(genShift(0,-2));
-	shifts.push_back(genShift(0,-2));
-	shifts.push_back(genShift(+1,-2));
-	shifts.push_back(genShift(+1,-1));
-      }
-    }
-    size_t nshift = shifts.size();
-
-    GeneralLocalStencil gStencil(ggrid,shifts);
-    {
-      autoView( gStaple_v , gStaple, AcceleratorWrite);
-      auto gStencil_v = gStencil.View();
-
-      typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
-      size_t vsize = Nd*sizeof(GaugeViewType);
-      GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
-      for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = Ug_dirs[i].View(AcceleratorRead);
-      GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
-      acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
-
-      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
-	  decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
-	  stencil_ss = Zero();
-	  int s=0;
-	  for(int nu=0;nu<Nd;nu++){
-	    if(nu != mu){
-	      //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
-	      GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
-	      auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-	      e = gStencil_v.GetEntry(s++,ss);
-	      auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
-	    
-	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
-
-	      //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
-
-	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
-
-	      //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-
-	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
-
-	      //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-
-	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
-
-	      //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-
-	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
-
-	      //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-	      e = gStencil_v.GetEntry(s++,ss);
-	      U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-
-	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
-
-	    }
-	  }
-	  assert(s==nshift);
-	  coalescedWrite(gStaple_v[ss],stencil_ss);
-	}
-	);
-  
-      for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
-      free(Ug_dirs_v_host);
-      acceleratorFreeDevice(Ug_dirs_v);
-    }   
-    Stap = Ghost.Extract(gStaple);    
-  }
-
-
-
-};  
-  
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  Coordinate latt_size  = GridDefaultLatt();
-  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
-  Coordinate mpi_layout = GridDefaultMpi();
-  std::cout << " mpi "<<mpi_layout<<std::endl;
-  std::cout << " simd "<<simd_layout<<std::endl;
-  std::cout << " latt "<<latt_size<<std::endl;
-  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
-
-  GridParallelRNG   pRNG(&GRID);
-  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-  LatticeGaugeField U(&GRID);
-
-  SU<Nc>::HotConfiguration(pRNG,U);
-
-  //typedef PeriodicGimplD Gimpl;
-  typedef ConjugateGimplD Gimpl;
-  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
-  Gimpl::setDirections(conj_dirs);
-
-  typedef typename WilsonLoopsTest<Gimpl>::GaugeMat GaugeMat;
-  typedef typename WilsonLoopsTest<Gimpl>::GaugeLorentz GaugeLorentz;
-
-  std::cout << GridLogMessage << "Checking Staple" << std::endl;
-  int count = 0;
-  double torig=0, tpadded=0;
-  
-  for(int mu=0;mu<Nd;mu++){
-    for(int nu=0;nu<Nd;nu++){
-      if(mu != nu){
-	GaugeMat staple_orig(&GRID), staple_padded(&GRID);
-	double t0 = usecond();
-	WilsonLoopsTest<Gimpl>::StapleOrig(staple_orig,U,mu,nu);
-	double t1 = usecond();
-	WilsonLoopsTest<Gimpl>::StaplePadded(staple_padded,U,mu,nu);
-	double t2 = usecond();
-	torig += t1-t0;  tpadded += t2-t1;
-	++count;
-	
-	GaugeMat diff = staple_orig - staple_padded;
-	double n = norm2(diff);
-	std::cout << GridLogMessage << mu << " " << nu << " " << n << std::endl;
-	assert(n<1e-10);
-      }
-    }
-  }
-  std::cout << GridLogMessage << "Staple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl;
-  count=0; torig=tpadded=0;
-    
-  std::cout << GridLogMessage << "Checking RectStaple" << std::endl;
-  for(int mu=0;mu<Nd;mu++){
-    GaugeMat staple_orig(&GRID), staple_padded(&GRID);
-    double t0 = usecond();
-    WilsonLoopsTest<Gimpl>::RectStapleOrig(staple_orig,U,mu);
-    double t1 = usecond();
-    WilsonLoopsTest<Gimpl>::RectStaplePadded(staple_padded,U,mu);
-    double t2 = usecond();
-    torig += t1-t0;  tpadded += t2-t1;
-    ++count;
-    
-    GaugeMat diff = staple_orig - staple_padded;
-    double n = norm2(diff);
-    std::cout << GridLogMessage << mu << " " << n << std::endl;
-    assert(n<1e-10);
-  }
-  std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl;
-  
-  Grid_finalize();
-}
--- a/tests/qdpxx/Test_qdpxx_munprec.cc
+++ b/tests/qdpxx/Test_qdpxx_munprec.cc
@@ -1,7 +1,6 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
-
    Source file: ./tests/qdpxx/Test_qdpxx_munprec.cc

    Copyright (C) 2015
@@ -26,13 +25,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+#include <chroma.h>
+#include <actions/ferm/invert/syssolver_linop_cg_array.h>
+#include <actions/ferm/invert/syssolver_linop_aggregate.h>
+
 #include <Grid/Grid.h>

 int    Ls=8;
 double M5=1.6;
 double mq=0.01;
-double zolo_lo = 0.1;
-double zolo_hi = 2.0;
+double zolo_lo = 0.01;
+double zolo_hi = 7.0;
 double mobius_scale=2.0;

 enum ChromaAction {
@@ -55,11 +58,6 @@ enum ChromaAction {
 void calc_grid      (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag);
 void calc_chroma    (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag);

-#include <chroma.h>
-#include <actions/ferm/invert/syssolver_linop_cg_array.h>
-#include <actions/ferm/invert/syssolver_linop_aggregate.h>
-
-

 namespace Chroma { 

@@ -81,7 +79,7 @@ public:

    std::vector<int> x(4);
    QDP::multi1d<int> cx(4);
-    std::vector<int> gd= gr.Grid()->GlobalDimensions();
+    Grid::Coordinate gd = gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -124,7 +122,7 @@ public:

    std::vector<int> x(5);
    QDP::multi1d<int> cx(4);
-    std::vector<int> gd= gr.Grid()->GlobalDimensions();
+    Grid::Coordinate gd= gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -166,7 +164,7 @@ public:

    std::vector<int> x(5);
    QDP::multi1d<int> cx(4);
-    std::vector<int> gd= gr.Grid()->GlobalDimensions();
+    Grid::Coordinate gd= gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -304,7 +302,30 @@ public:
     //     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
     //     param.approximation_type=COEFF_TYPE_TANH;
     param.tuning_strategy_xml=
-"<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name></TuningStrategy>\n";
+"<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name><TuningConstant>1.0</TuningConstant></TuningStrategy>\n";
+     UnprecOvExtFermActArray S_f(cfs,param);
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
+     return M;
+   }
+   if ( parms == HwPartFracTanh ) {
+     if ( Ls%2 == 0 ) { 
+       printf("Ls is not odd\n");
+       exit(-1);
+     }
+     UnprecOvExtFermActArrayParams param;
+     param.OverMass=M5; 
+     param.Mass=_mq;
+     param.RatPolyDeg = Ls;
+     param.ApproxMin =eps_lo;
+     param.ApproxMax =eps_hi;
+     param.b5 =1.0;
+     param.c5 =1.0;
+     //     param.approximation_type=COEFF_TYPE_ZOLOTAREV;
+     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
+     //param.approximation_type=COEFF_TYPE_TANH;
+     param.tuning_strategy_xml=
+       "<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name><TuningConstant>1.0</TuningConstant></TuningStrategy>\n";
     UnprecOvExtFermActArray S_f(cfs,param);
     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
@@ -316,7 +337,35 @@ public:
     param.ApproxMin=eps_lo;
     param.ApproxMax=eps_hi;
     param.approximation_type=COEFF_TYPE_ZOLOTAREV;
-     param.RatPolyDeg=Ls;
+     param.RatPolyDeg=Ls-1;
+     // The following is why I think Chroma made some directional errors:
+     param.AuxFermAct= std::string(
+"<AuxFermAct>\n"
+"  <FermAct>UNPRECONDITIONED_WILSON</FermAct>\n"
+"  <Mass>-1.8</Mass>\n"
+"  <b5>1</b5>\n"
+"  <c5>0</c5>\n"
+"  <MaxCG>1000</MaxCG>\n"
+"  <RsdCG>1.0e-9</RsdCG>\n"
+"  <FermionBC>\n"
+"      <FermBC>SIMPLE_FERMBC</FermBC>\n"
+"      <boundary>1 1 1 1</boundary>\n"
+"   </FermionBC> \n"
+"</AuxFermAct>"
+);
+     param.AuxFermActGrp= std::string("");
+     UnprecOvlapContFrac5DFermActArray S_f(fbc,param);
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
+     return  M;
+   }
+   if ( parms == HwContFracTanh ) {
+     UnprecOvlapContFrac5DFermActParams param;
+     param.Mass=_mq; // How is M5 set? Wilson mass In AuxFermAct
+     param.ApproxMin=eps_lo;
+     param.ApproxMax=eps_hi;
+     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
+     param.RatPolyDeg=Ls-1;
     // The following is why I think Chroma made some directional errors:
     param.AuxFermAct= std::string(
 "<AuxFermAct>\n"
@@ -378,7 +427,14 @@ int main (int argc,char **argv )
   * Setup QDP
   *********************************************************/
  Chroma::initialize(&argc,&argv);
-  Chroma::WilsonTypeFermActs4DEnv::registerAll(); 
+  //  Chroma::WilsonTypeFermActs4DEnv::registerAll(); 
+  Chroma::WilsonTypeFermActsEnv::registerAll(); 
+  //bool linkageHack(void)
+  //{
+  //  bool foo = true;
+  // Inline Measurements
+  //  InlineAggregateEnv::registerAll();
+  //  GaugeInitEnv::registerAll();

  /********************************************************
   * Setup Grid
@@ -388,26 +444,34 @@ int main (int argc,char **argv )
                                                                       Grid::GridDefaultSimd(Grid::Nd,Grid::vComplex::Nsimd()),
                                                                       Grid::GridDefaultMpi());
  
-  std::vector<int> gd = UGrid->GlobalDimensions();
+  Grid::Coordinate gd = UGrid->GlobalDimensions();
  QDP::multi1d<int> nrow(QDP::Nd);
  for(int mu=0;mu<4;mu++) nrow[mu] = gd[mu];

  QDP::Layout::setLattSize(nrow);
  QDP::Layout::create();

-  Grid::GridCartesian         * FGrid   = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  Grid::LatticeGaugeField lat(UGrid);
-  Grid::LatticeFermion    src(FGrid);
-  Grid::LatticeFermion    res_chroma(FGrid);
-  Grid::LatticeFermion    res_grid  (FGrid);
-  
  std::vector<ChromaAction> ActionList({
 		 HtCayleyTanh, // Plain old DWF.
 		 HmCayleyTanh,
 		 HwCayleyTanh,
 		 HtCayleyZolo, // Plain old DWF.
 		 HmCayleyZolo,
-		 HwCayleyZolo
+		 HwCayleyZolo,
+		 HwPartFracZolo,
+		 HwContFracZolo,
+		 HwContFracTanh
+  });
+  std::vector<int> LsList({
+      8,//HtCayleyTanh, // Plain old DWF.
+      8,//HmCayleyTanh,
+      8,//HwCayleyTanh,
+      8,//HtCayleyZolo, // Plain old DWF.
+      8,//HmCayleyZolo,
+      8,//HwCayleyZolo,
+      9,//HwPartFracZolo
+      9, //HwContFracZolo
+      9 //HwContFracTanh
  });
  std::vector<std::string> ActionName({
        "HtCayleyTanh",
@@ -415,10 +479,19 @@ int main (int argc,char **argv )
 	"HwCayleyTanh",
 	"HtCayleyZolo",
 	"HmCayleyZolo",
-        "HwCayleyZolo"
+        "HwCayleyZolo",
+	"HwPartFracZolo",
+	"HwContFracZolo",
+	"HwContFracTanh"
  });

  for(int i=0;i<ActionList.size();i++) {
+    Ls = LsList[i];
+    Grid::GridCartesian      * FGrid   = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+    Grid::LatticeGaugeField lat(UGrid);
+    Grid::LatticeFermion    src(FGrid);
+    Grid::LatticeFermion    res_chroma(FGrid);
+    Grid::LatticeFermion    res_grid  (FGrid);
    std::cout << "*****************************"<<std::endl;
    std::cout << "Action "<<ActionName[i]<<std::endl;
    std::cout << "*****************************"<<std::endl;
@@ -439,6 +512,7 @@ int main (int argc,char **argv )
      
      std::cout << "Norm of difference "<<Grid::norm2(res_chroma)<<std::endl;
    }
+    delete FGrid;
  }

  std::cout << "Finished test "<<std::endl;
@@ -502,7 +576,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF
  Grid::gaussian(RNG5,src);
  Grid::gaussian(RNG5,res);

-  Grid::SU<Nc>::HotConfiguration(RNG4,Umu);
+  Grid::SU<Grid::Nc>::HotConfiguration(RNG4,Umu);

  /*
  Grid::LatticeColourMatrix U(UGrid);
@@ -519,7 +593,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HtCayleyTanh ) { 

-    Grid::DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);
+    Grid::DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);

    std::cout << Grid::GridLogMessage <<" Calling domain wall multiply "<<std::endl;

@@ -535,7 +609,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

    Grid::Real _b = 0.5*(mobius_scale +1.0);
    Grid::Real _c = 0.5*(mobius_scale -1.0);
-    Grid::MobiusZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);
+    Grid::MobiusZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);

    std::cout << Grid::GridLogMessage <<" Calling mobius zolo multiply "<<std::endl;

@@ -549,7 +623,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HtCayleyZolo ) {

-    Grid::ShamirZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    Grid::ShamirZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);

    std::cout << Grid::GridLogMessage <<" Calling shamir zolo multiply "<<std::endl;

@@ -561,6 +635,60 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF
    return;
  }

+  if ( action == HwPartFracTanh ) {
+
+    Grid::OverlapWilsonPartialFractionTanhFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
+
+    std::cout << Grid::GridLogMessage <<" Calling part frac tanh multiply "<<std::endl;
+
+    if ( dag ) 
+      Dov.Mdag(src,res);  
+    else 
+      Dov.M(src,res);  
+
+    return;
+  }
+
+  if ( action == HwContFracTanh ) {
+
+    Grid::OverlapWilsonContFracTanhFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
+
+    std::cout << Grid::GridLogMessage <<" Calling cont frac tanh multiply "<<std::endl;
+
+    if ( dag ) 
+      Dov.Mdag(src,res);  
+    else 
+      Dov.M(src,res);  
+
+    return;
+  }
+  if ( action == HwContFracZolo ) {
+
+    Grid::OverlapWilsonContFracZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+
+    std::cout << Grid::GridLogMessage <<" Calling cont frac zolo multiply "<<std::endl;
+
+    if ( dag ) 
+      Dov.Mdag(src,res);  
+    else 
+      Dov.M(src,res);  
+
+    return;
+  }
+
+  if ( action == HwPartFracZolo ) {
+
+    Grid::OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    std::cout << Grid::GridLogMessage <<" Calling part frac zolotarev multiply "<<std::endl;
+
+    if ( dag ) 
+      Dov.Mdag(src,res);  
+    else 
+      Dov.M(src,res);  
+
+    return;
+  }
+  
  /*
  if ( action == HmCayleyTanh ) {
    Grid::Real _b = 0.5*(mobius_scale +1.0);
@@ -581,7 +709,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HmCayleyTanh ) {

-    Grid::ScaledShamirFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);
+    Grid::ScaledShamirFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);

    std::cout << Grid::GridLogMessage <<" Calling scaled shamir multiply "<<std::endl;

@@ -595,7 +723,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HwCayleyTanh ) {

-    Grid::OverlapWilsonCayleyTanhFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
+    Grid::OverlapWilsonCayleyTanhFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);

    if ( dag ) 
      D.Mdag(src,res);  
@@ -607,7 +735,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HwCayleyZolo ) {

-    Grid::OverlapWilsonCayleyZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    Grid::OverlapWilsonCayleyZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);

    if ( dag ) 
      D.Mdag(src,res);  
--- a/tests/solver/Test_dwf_cg_prec.cc
+++ b/tests/solver/Test_dwf_cg_prec.cc
@@ -1,4 +1,4 @@
-/*************************************************************************************
+/*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid

@@ -67,7 +67,13 @@ int main(int argc, char** argv) {
  result = Zero();
  LatticeGaugeField Umu(UGrid);

+#if 0
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000");
+  NerscIO::readConfiguration(Umu,header,file);
+#else  
  SU<Nc>::HotConfiguration(RNG4, Umu);
+#endif

  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt()
            << "   Ls: " << Ls << std::endl;
--- a/tests/solver/Test_dwf_cg_unprec.cc
+++ b/tests/solver/Test_dwf_cg_unprec.cc
@@ -54,15 +54,30 @@ int main (int argc, char ** argv)
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

+  std::vector<ComplexD> qmu;
+  qmu.push_back(ComplexD(0.1,0.0));
+  qmu.push_back(ComplexD(0.0,0.0));
+  qmu.push_back(ComplexD(0.0,0.0));
+  qmu.push_back(ComplexD(0.0,0.01));
+  
+
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

+  LatticeFermion    tmp(FGrid);
  LatticeFermion    src(FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=Zero();
-  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
-
+  LatticeGaugeField Umu(UGrid); 
+#if 0
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000");
+  NerscIO::readConfiguration(Umu,header,file);
+#else  
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+#endif
+  
  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -71,8 +86,15 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5=1.8;
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  Ddwf.qmu = qmu;

+  Ddwf.M(src,tmp);
+  std::cout << " |M src|^2 "<<norm2(tmp)<<std::endl;
  MdagMLinearOperator<DomainWallFermionD,LatticeFermion> HermOp(Ddwf);
+  HermOp.HermOp(src,tmp);
+
+  std::cout << " <src|MdagM| src> "<<innerProduct(src,tmp)<<std::endl;
+  
  ConjugateGradient<LatticeFermion> CG(1.0e-6,10000);
  CG(HermOp,src,result);
Author	SHA1	Message	Date
Peter Boyle	6815e138b4	Boosted fermion attempt	2024-10-17 18:37:33 +01:00
Peter Boyle	e29b97b3ea	Qslash term added	2023-09-14 16:14:03 -04:00
Peter Boyle	ad2b699d2b	Better macos	2023-09-14 16:12:21 -04:00