Merge 1dfaa08afb into b8a7004365

The stencils for the staple and rect-staple padded cell implementations are now created and stored by workspace classes that allow for reuse, provided the grids remain consistent.
The workspaces are now used by the plaq+rectangle gauge action, resulting in a further 2x performance improvement as measured on a 16^4 local volume on 2 nodes (16 ranks) of Crusher.
2026-01-25 10:54:43 +00:00 · 2023-09-03 09:29:21 -07:00 · 2023-06-28 15:11:24 -04:00 · 2023-06-27 14:58:10 -04:00 · 2023-06-27 11:23:30 -04:00 · 2023-06-26 15:48:47 -04:00
23 changed files with 1661 additions and 530 deletions
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -47,3 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
+#include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -697,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
-
  // the above should guarantee that the operations are local
+  
+#if 1
+
+  size_t nsite = 1;
+  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
+  
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+ 
+  thread_for(idx, nsite, {
+      Coordinate from_coor, to_coor;
+      size_t rem = idx;
+      for(int i=0;i<nd;i++){
+	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
+	from_coor[i] = base_i + FromLowerLeft[i];
+	to_coor[i] = base_i + ToLowerLeft[i];
+      }
+      
+      int foidx = Fg->oIndex(from_coor);
+      int fiidx = Fg->iIndex(from_coor);
+      int toidx = Tg->oIndex(to_coor);
+      int tiidx = Tg->iIndex(to_coor);
+      int* tt = table + 4*idx;
+      tt[0] = foidx;
+      tt[1] = fiidx;
+      tt[2] = toidx;
+      tt[3] = tiidx;
+    });
+  
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(from_v,From,AcceleratorRead);
+  autoView(to_v,To,AcceleratorWrite);
+  
+  accelerator_for(idx,nsite,1,{
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+      int* tt = table_d + 4*idx;
+      int from_oidx = *tt++;
+      int from_lane = *tt++;
+      int to_oidx = *tt++;
+      int to_lane = *tt;
+
+      const vector_type* from = (const vector_type *)&from_v[from_oidx];
+      vector_type* to = (vector_type *)&to_v[to_oidx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+  
+  acceleratorFreeDevice(table_d);    
+  free(table);
+  
+
+#else  
  Coordinate ldf = Fg->_ldimensions;
  Coordinate rdf = Fg->_rdimensions;
  Coordinate isf = Fg->_istride;
@@ -738,6 +798,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 #endif
    }
  });
+
+#endif
 }


@@ -830,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 }


+//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
+//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@@ -851,6 +915,65 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
    }
  }

+#if 1
+  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+  
+  thread_for(idx,nsite,{
+    Coordinate lcoor(nl);
+    Coordinate hcoor(nh);
+    lcoor[orthog] = slice_lo;
+    hcoor[orthog] = slice_hi;
+    size_t rem = idx;
+    for(int mu=0;mu<nl;mu++){
+      if(mu != orthog){
+	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
+	lcoor[mu] = hcoor[mu] = xmu;
+      }
+    }
+    int loidx = lg->oIndex(lcoor);
+    int liidx = lg->iIndex(lcoor);
+    int hoidx = hg->oIndex(hcoor);
+    int hiidx = hg->iIndex(hcoor);
+    int* tt = table + 4*idx;
+    tt[0] = loidx;
+    tt[1] = liidx;
+    tt[2] = hoidx;
+    tt[3] = hiidx;
+    });
+   
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(lowDim_v,lowDim,AcceleratorRead);
+  autoView(higherDim_v,higherDim,AcceleratorWrite);
+  
+  accelerator_for(idx,nsite,1,{
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+      int* tt = table_d + 4*idx;
+      int from_oidx = *tt++;
+      int from_lane = *tt++;
+      int to_oidx = *tt++;
+      int to_lane = *tt;
+
+      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
+      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+  
+  acceleratorFreeDevice(table_d);    
+  free(table);
+  
+#else
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
@@ -866,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
+#endif
 }


--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -26,14 +26,32 @@ Author: Peter Boyle pboyle@bnl.gov
 /*  END LEGAL */
 #pragma once

+#include<Grid/cshift/Cshift.h>
+
 NAMESPACE_BEGIN(Grid);

+//Abstract interface allowing the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
+template<typename vobj>
+struct CshiftImplBase{
+  virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0; //in: field to shift, dir: direction, shift: displacement; returns the shifted field
+  virtual ~CshiftImplBase(){} //virtual destructor as implementations are used through base-class references
+};
+template<typename vobj>
+struct CshiftImplDefault: public CshiftImplBase<vobj>{ //Default implementation: forwards unchanged to Grid::Cshift
+  Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
+};
+template<typename Gimpl>
+struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{ //Gauge-link implementation: forwards to Gimpl::CshiftLink so the shift is whatever the gauge implementation defines
+  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
+};  
+
 class PaddedCell {
 public:
  GridCartesian * unpadded_grid;
  int dims;
  int depth;
  std::vector<GridCartesian *> grids;
+
  ~PaddedCell()
  {
    DeleteGrids();
@@ -77,7 +95,7 @@ public:
    }
  };
  template<class vobj>
-  inline Lattice<vobj> Extract(Lattice<vobj> &in)
+  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
  {
    Lattice<vobj> out(unpadded_grid);

@@ -88,19 +106,19 @@ public:
    return out;
  }
  template<class vobj>
-  inline Lattice<vobj> Exchange(Lattice<vobj> &in)
+  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    int dims = old_grid->Nd();
    Lattice<vobj> tmp = in;
    for(int d=0;d<dims;d++){
-      tmp = Expand(d,tmp); // rvalue && assignment
+      tmp = Expand(d,tmp,cshift); // rvalue && assignment
    }
    return tmp;
  }
  // expand up one dim at a time
  template<class vobj>
-  inline Lattice<vobj> Expand(int dim,Lattice<vobj> &in)
+  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
@@ -112,20 +130,40 @@ public:
    else       conformable(old_grid,grids[dim-1]);

    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
+
+    double tins=0, tshift=0;
+    
    // Middle bit
+    double t = usecond();
    for(int x=0;x<local[dim];x++){
      InsertSliceLocal(in,padded,x,depth+x,dim);
    }
+    tins += usecond() - t;
+    
    // High bit
-    shifted = Cshift(in,dim,depth);
+    t = usecond();
+    shifted = cshift.Cshift(in,dim,depth);
+    tshift += usecond() - t;
+
+    t=usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
+    tins += usecond() - t;
+    
    // Low bit
-    shifted = Cshift(in,dim,-depth);
+    t = usecond();
+    shifted = cshift.Cshift(in,dim,-depth);
+    tshift += usecond() - t;
+    
+    t = usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
+    tins += usecond() - t;
+
+    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
+    
    return padded;
  }

--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -124,11 +124,6 @@ public:
  RealD                _b;
  RealD                _c;

-  // possible boost
-  std::vector<ComplexD> qmu;
-  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
-  void addQmu(const FermionField &in, FermionField &out, int dag);
-  
  // Cayley form Moebius (tanh and zolotarev)
  Vector<Coeff_t> omega;
  Vector<Coeff_t> bs;    // S dependent coeffs
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -60,50 +60,6 @@ public:
  //      virtual void   Instantiatable(void)=0;
  virtual void   Instantiatable(void) =0;

-  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
-  {
-    std::cout << "Free Propagator for PartialFraction"<<std::endl;
-    FermionField in_k(in.Grid());
-    FermionField prop_k(in.Grid());
-    
-    FFT theFFT((GridCartesian *) in.Grid());
-
-    //phase for boundary condition
-    ComplexField coor(in.Grid());
-    ComplexField ph(in.Grid());  ph = Zero();
-    FermionField in_buf(in.Grid()); in_buf = Zero();
-    typedef typename Simd::scalar_type Scalar;
-    Scalar ci(0.0,1.0);
-    assert(twist.size() == Nd);//check that twist is Nd
-    assert(boundary.size() == Nd);//check that boundary conditions is Nd
-    int shift = 0;
-    for(unsigned int nu = 0; nu < Nd; nu++)
-      {
-	// Shift coordinate lattice index by 1 to account for 5th dimension.
-	LatticeCoordinate(coor, nu + shift);
-	double boundary_phase = ::acos(real(boundary[nu]));
-	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
-	//momenta for propagator shifted by twist+boundary
-	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
-      }
-    in_buf = exp(ci*ph*(-1.0))*in;
-
-    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
-    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
-    
-    //phase for boundary condition
-    out = out * exp(ci*ph);
-  };
-
-  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
-    std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-    std::vector<Complex> boundary;
-    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
-    FreePropagator(in,out,mass,boundary,twist);
-  };
-
-  
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
 public:
  INHERIT_IMPL_TYPES(Impl);

-  const int part_frac_chroma_convention=0;
+  const int part_frac_chroma_convention=1;

  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
@@ -83,63 +83,12 @@ public:
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());

-  PartialFractionFermion5D(GaugeField &_Umu,
-			   GridCartesian         &FiveDimGrid,
-			   GridRedBlackCartesian &FiveDimRedBlackGrid,
-			   GridCartesian         &FourDimGrid,
-			   GridRedBlackCartesian &FourDimRedBlackGrid,
-			   RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
-
-  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
-  {
-    std::cout << "Free Propagator for PartialFraction"<<std::endl;
-    FermionField in_k(in.Grid());
-    FermionField prop_k(in.Grid());
-    
-    FFT theFFT((GridCartesian *) in.Grid());
-
-    //phase for boundary condition
-    ComplexField coor(in.Grid());
-    ComplexField ph(in.Grid());  ph = Zero();
-    FermionField in_buf(in.Grid()); in_buf = Zero();
-    typedef typename Simd::scalar_type Scalar;
-    Scalar ci(0.0,1.0);
-    assert(twist.size() == Nd);//check that twist is Nd
-    assert(boundary.size() == Nd);//check that boundary conditions is Nd
-    int shift = 0;
-    for(unsigned int nu = 0; nu < Nd; nu++)
-      {
-	// Shift coordinate lattice index by 1 to account for 5th dimension.
-	LatticeCoordinate(coor, nu + shift);
-	double boundary_phase = ::acos(real(boundary[nu]));
-	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
-	//momenta for propagator shifted by twist+boundary
-	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
-      }
-    in_buf = exp(ci*ph*(-1.0))*in;
-
-    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
-    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
-    
-    //phase for boundary condition
-    out = out * exp(ci*ph);
-  };
-
-  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
-    std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-    std::vector<Complex> boundary;
-    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
-    FreePropagator(in,out,mass,boundary,twist);
-  };
-  
 protected:

  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);

  // Part frac
-  std::vector<RealD> qmu;
  RealD mass;
  RealD dw_diag;
  RealD R;
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -48,8 +48,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
  mass_plus(_mass), mass_minus(_mass)
-{
-  // qmu defaults to zero size;
+{ 
 }

 ///////////////////////////////////////////////////////////////
@@ -271,34 +270,6 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  M5Ddag(psi,psi,Din,lower,diag,upper);
 }

-template<class Impl>
-void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
-{
-  if ( qmu.size() ) {
-
-    Gamma::Algebra Gmu [] = {
-      Gamma::Algebra::GammaX,
-      Gamma::Algebra::GammaY,
-      Gamma::Algebra::GammaZ,
-      Gamma::Algebra::GammaT
-    };
-    std::vector<ComplexD> coeff(Nd);
-    ComplexD ci(0,1);
-
-    assert(qmu.size()==Nd);
-
-    for(int mu=0;mu<Nd;mu++){
-       coeff[mu] = ci*qmu[mu];
-       if ( dag ) coeff[mu] = conjugate(coeff[mu]);
-    }
-
-    chi = chi + Gamma(Gmu[0])*psi*coeff[0];
-    for(int mu=1;mu<Nd;mu++){
-      chi = chi + Gamma(Gmu[mu])*psi*coeff[mu];
-    }
-  }
-}
-
 template<class Impl>
 void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
@@ -306,12 +277,8 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  
  // Assemble Din
  Meooe5D(psi,Din);
-
-  this->DW(Din,chi,DaggerNo);
-
-  // add i q_mu gamma_mu here
-  addQmu(Din,chi,DaggerNo);
  
+  this->DW(Din,chi,DaggerNo);
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
  
@@ -328,9 +295,6 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  FermionField Din(psi.Grid());
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
-
-  // add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not.
-  addQmu(psi,Din,DaggerYes);
  
  MeooeDag5D(Din,chi);
  
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -42,13 +42,13 @@ template<class Impl>
 void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
 {
  // How to check Ls matches??
-  std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
  int Ls = this->Ls;
-  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
  assert(zdata->db==Ls);// Beta has Ls coeffs

  R=(1+this->mass)/(1-this->mass);
@@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, 0);
+      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
    }
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, 0);
+      InsertSlice(input4d, tmp, Ls-1, Ls-1);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -255,76 +255,15 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  }
 	
  {
-    // The 'conventional' Cayley overlap operator is
-    //
-    // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
-    //
-    //
-    // With massless limit 1/2(1+g5 sgnHw)
-    //
-    // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
-    //
-    // However, the conventional normalisation has both a leading order factor of 2 in Zq
-    // at tree level AND a mass dependent (1-m) that are convenient to absorb.
-    //
-    // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
-    //
-    // num = -i sin kmu gmu
-    //
-    // denom ( sqrt(sk^2 + (2shk^2 - 1)^2
-    //    b_k = sk2 - M5;
-    //     
-    //    w_k = sqrt(sk + b_k*b_k);
-    //
-    //    denom= ( w_k + b_k + mass*mass) ;
-    //
-    //    denom= one/denom;
-    //    out = num*denom;
-    //
-    // Chroma, and Grid define partial fraction via 4d operator
-    //
-    //   Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
-    //
-    // Now since:
-    //
-    //      (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
-    //
-    // This corresponds to a modified mass parameter
-    //
-    // It has an annoying 
-    //
-    // 
    double R=(1+this->mass)/(1-this->mass);
    //R g5 psi[Ls] + p[0] H
    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-    
+	
    for(int b=0;b<nblock;b++){
      int s = 2*b+1;
      double pp = p[nblock-1-b];
      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
    }
-
-    if ( qmu.size() ) {
-
-      FermionField qslash_psi(psi.Grid());
-      
-      Gamma::Algebra Gmu [] = {
-			 Gamma::Algebra::GammaX,
-			 Gamma::Algebra::GammaY,
-			 Gamma::Algebra::GammaZ,
-			 Gamma::Algebra::GammaT
-      };
-      ComplexD ci(0,1);
-      assert(qmu.size()==Nd);
-      qslash_psi = Gamma(Gmu[0])*psi;
-      for(int mu=1;mu<Nd;mu++){
-	qslash_psi = Gamma(Gmu[mu])*psi;
-      }
-      //      RealD coeff = 1.0;
-      qslash_psi = Gamma(Gamma::Algebra::Gamma5)*qslash_psi*ci ; // i g5 qslash -- 1-m factor???
-      axpby_ssp(chi,1.0,chi,1.0, qslash_psi,Ls-1,Ls-1);
-    }
-    
  }

 }
@@ -472,7 +411,7 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, 0);
+      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
    }
    template<class Impl>
    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@@ -482,8 +421,7 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      std::cout << " importing to slice " << Ls-1 <<std::endl;
-      InsertSlice(input4d, tmp, Ls-1, 0);
+      InsertSlice(input4d, tmp, Ls-1, Ls-1);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
@@ -504,7 +442,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

 {
  int Ls = this->Ls;
-  qmu.resize(0);
+
  assert((Ls&0x1)==1); // Odd Ls required
  int nrational=Ls-1;

@@ -522,22 +460,6 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
  Approx::zolotarev_free(zdata);

 }
-template<class Impl>
-PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
-							 GridCartesian         &FiveDimGrid,
-							 GridRedBlackCartesian &FiveDimRedBlackGrid,
-							 GridCartesian         &FourDimGrid,
-							 GridRedBlackCartesian &FourDimRedBlackGrid,
-							 RealD _mass,RealD M5,
-							 std::vector<RealD> &_qmu,
-							 const ImplParams &p)
-  : PartialFractionFermion5D<Impl>(_Umu,
-			     FiveDimGrid,FiveDimRedBlackGrid,
-			     FourDimGrid,FourDimRedBlackGrid,
-			     _mass,M5,p)
-{
-  qmu=_qmu;
-}

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -176,7 +176,7 @@ public:
      return PeriodicBC::CshiftLink(Link,mu,shift);
  }

-  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
+  static inline void       setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
 };
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@@ -43,7 +43,7 @@ public:
 private:
  RealD c_plaq;
  RealD c_rect;
-
+  typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
 public:
  PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};

@@ -79,27 +79,18 @@ public:
    GridBase *grid = Umu.Grid();

    std::vector<GaugeLinkField> U (Nd,grid);
-    std::vector<GaugeLinkField> U2(Nd,grid);
-
    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-      WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
    }
+    std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
+    WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);

    GaugeLinkField dSdU_mu(grid);
    GaugeLinkField staple(grid);

    for (int mu=0; mu < Nd; mu++){
-
-      // Staple in direction mu
-
-      WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
-
-      dSdU_mu = Ta(U[mu]*staple)*factor_p;
-
-      WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
-
-      dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
+      dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
+      dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
 	  
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -37,13 +37,14 @@ NAMESPACE_BEGIN(Grid);
 // Make these members of an Impl class for BC's.

 namespace PeriodicBC { 
-
+  //Out(x) = Link(x)*field(x+mu)
  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
 									   int mu,
 									   const Lattice<covariant> &field)
  {
    return Link*Cshift(field,mu,1);// moves towards negative mu
  }
+  //Out(x) = Link^dag(x-mu)*field(x-mu)
  template<class covariant,class gauge> Lattice<covariant> CovShiftBackward(const Lattice<gauge> &Link, 
 									    int mu,
 									    const Lattice<covariant> &field)
@@ -52,19 +53,19 @@ namespace PeriodicBC {
    tmp = adj(Link)*field;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
-
+  //Out(x) = Link^dag(x-mu)
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) 
  {
    return Cshift(adj(Link), mu, -1);
  }
-
+  //Out(x) = Link(x)
  template<class gauge> Lattice<gauge>
  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
  {
    return Link;
  }
-
+  //Out(x) = Link(x+mu)
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -290,7 +290,7 @@ public:
  }
 */
  //////////////////////////////////////////////////
-  // the sum over all staples on each site
+  // the sum over all nu-oriented staples for nu != mu on each site
  //////////////////////////////////////////////////
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

@@ -300,6 +300,10 @@ public:
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
+    Staple(staple, U, mu);
+  }
+
+  static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
    staple = Zero();

    for (int nu = 0; nu < Nd; nu++) {
@@ -335,6 +339,202 @@ public:
    }
  }

+  /////////////
+  //Staples for each direction mu, summed over nu != mu
+  //staple: output staples for each mu (must have size Nd)
+  //U: link array, one field per direction (must have size Nd)
+  /////////////
+  static void StapleAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U) {
+    assert(staple.size() == Nd); assert(U.size() == Nd); //both arrays must already hold one field per direction
+    for(int mu=0;mu<Nd;mu++) Staple(staple[mu], U, mu); //delegate to the per-direction Staple overload that takes the link array
+  }
+
+
+  //Abstract workspace class that owns a GeneralLocalStencil and allows its reuse; derived classes supply the shifts and the required padding depth
+  class WilsonLoopPaddedStencilWorkspace{
+    std::unique_ptr<GeneralLocalStencil> stencil; //null until the first call to getStencil
+    size_t nshift; //NOTE(review): only assigned in generateStencil, so Nshift() is meaningful only after getStencil has been called
+
+    void generateStencil(GridBase* padded_grid){ //(re)build the stencil on the given padded grid and log how long it took
+      double t0 = usecond();
+      
+      //Generate shift arrays
+      std::vector<Coordinate> shifts = this->getShifts();
+      nshift = shifts.size();
+      
+      double t1 = usecond();
+      //Generate local stencil
+      stencil.reset(new GeneralLocalStencil(padded_grid,shifts));
+      double t2 = usecond();
+      std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl;   
+    }
+  public:
+    //Get the stencil. If not already generated, or if generated using a different Grid than the last (fully padded) grid of the PaddedCell, it will be created on-the-fly
+    const GeneralLocalStencil & getStencil(const PaddedCell &pcell){
+      assert(pcell.depth >= this->paddingDepth()); //the cell must be padded at least as deep as this stencil requires
+      if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back());
+      return *stencil;
+    }
+    size_t Nshift() const{ return nshift; } //number of shifts in the most recently generated stencil
+    
+    virtual std::vector<Coordinate> getShifts() const = 0; //the list of shifts defining the stencil, in the order the consuming kernel reads them
+    virtual int paddingDepth() const = 0; //padding depth required
+    
+    virtual ~WilsonLoopPaddedStencilWorkspace(){}
+  };
+
+  //This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces; it owns both the PaddedCell and the registered stencil workspaces
+  class WilsonLoopPaddedWorkspace{
+    std::vector<WilsonLoopPaddedStencilWorkspace*> stencil_wk; //owned raw pointers, deleted in the destructor
+    std::unique_ptr<PaddedCell> pcell; //null until first requested
+
+    void generatePcell(GridBase* unpadded_grid){ //(re)create the padded cell, deep enough for every registered stencil
+      assert(stencil_wk.size()); //at least one stencil must have been added
+      int max_depth = 0;
+      for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth());
+      
+      pcell.reset(new PaddedCell(max_depth, dynamic_cast<GridCartesian*>(unpadded_grid))); //NOTE(review): the dynamic_cast result is not checked here; presumably callers always pass a GridCartesian - confirm
+    }
+    
+  public:
+    //Add a stencil definition. This should be done before the first call to retrieve a stencil object (asserted: the padded cell must not exist yet).
+    //Takes ownership of the pointer
+    void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){
+      assert(!pcell);
+      stencil_wk.push_back(stencil);
+    }
+
+    const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){ //stencil_idx: position in order of addStencil calls; NOTE(review): not range-checked
+      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid); //rebuild the padded cell if the unpadded grid has changed
+      return stencil_wk[stencil_idx]->getStencil(*pcell);
+    }      
+    const PaddedCell & getPaddedCell(GridBase* unpadded_grid){ //the shared padded cell, created or rebuilt on demand for this unpadded grid
+      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
+      return *pcell;
+    }
+    
+    ~WilsonLoopPaddedWorkspace(){
+      for(auto &s : stencil_wk) delete s; //release the stencil workspaces taken over in addStencil
+    }
+  };
+
+  //Stencil workspace for StaplePaddedAll: defines the shifts of the plaquette staple for every mu and every nu != mu
+  class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
+  public:
+    std::vector<Coordinate> getShifts() const override{ //six shifts are appended per (mu, nu != mu) pair, with mu as the outer loop
+      std::vector<Coordinate> shifts;
+      for(int mu=0;mu<Nd;mu++){
+	for(int nu=0;nu<Nd;nu++){
+	  if(nu != mu){
+	    Coordinate shift_0(Nd,0);
+	    Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
+	    Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
+	    Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
+	    Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
+      
+	    //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x); shifts are pushed right-to-left, i.e. in the order the links are read
+	    shifts.push_back(shift_0);
+	    shifts.push_back(shift_nu);
+	    shifts.push_back(shift_mu);
+      
+	    //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu); again pushed right-to-left
+	    shifts.push_back(shift_mnu);
+	    shifts.push_back(shift_mnu);
+	    shifts.push_back(shift_mnu_pmu);
+	  }
+	}
+      }
+      return shifts;
+    }
+
+    int paddingDepth() const override{ return 1; } //every shift above moves by at most one site in any direction
+  }; 
+
+  //Padded cell implementation of the staple method for all mu, summed over nu != mu
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
+    StaplePaddedAllWorkspace wk; //local workspace, so the stencil is generated afresh on every call to this overload
+    StaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
+  }
+  
+  //Padded cell implementation of the staple method for all mu, summed over nu != mu
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  //gStencil: the precomputed generalized local stencil for the staple
+  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
+    double t0 = usecond();
+    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
+    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); //links must live on the last (fully padded) grid of the cell
+    assert(Cell.depth >= 1);
+    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
+
+    int shift_mu_off = gStencil._npoints/Nd; //stencil entries per mu; assumes the stencil holds the same number of shifts for every mu, as StaplePaddedAllWorkspace::getShifts produces
+    
+    //Open views to padded gauge links and keep open over mu loop
+    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+    size_t vsize = Nd*sizeof(GaugeViewType);
+    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); //host-side array of the Nd views
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
+    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize); //copy of the view array that the kernel below indexes by direction
+    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
+    
+    GaugeMat gStaple(ggrid); //staple on the padded grid, overwritten for each mu
+
+    int outer_off = 0; //index of the first stencil entry belonging to the current mu
+    for(int mu=0;mu<Nd;mu++){
+      { //view scope
+	autoView( gStaple_v , gStaple, AcceleratorWrite);
+	auto gStencil_v = gStencil.View();
+	
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
+	    stencil_ss = Zero();
+	    int off = outer_off;
+	    
+	    for(int nu=0;nu<Nd;nu++){
+	      if(nu != mu){	  
+		GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss); //entries are consumed in the order the shifts were listed when the stencil was built
+		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+      
+		stencil_ss = stencil_ss + U2 * U1 * U0; //with the StaplePaddedAllWorkspace shifts this is U_nu(x+mu) U^dag_mu(x+nu) U^dag_nu(x)
+
+		e = gStencil_v.GetEntry(off++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(off++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+		stencil_ss = stencil_ss + U2 * U1 * U0; //with the StaplePaddedAllWorkspace shifts this is U^dag_nu(x-nu+mu) U^dag_mu(x-nu) U_nu(x-nu)
+	      }
+	    }
+		
+	    coalescedWrite(gStaple_v[ss],stencil_ss);
+	  }
+	  );
+      } //ensure views are all closed!
+      
+      staple[mu] = Cell.Extract(gStaple); //copy the result back onto the unpadded grid
+      outer_off += shift_mu_off; //advance to the stencil entries of the next mu
+    }//mu loop
+
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose(); //close the link views opened before the mu loop
+    free(Ug_dirs_v_host);
+    acceleratorFreeDevice(Ug_dirs_v);
+    
+    double t1=usecond();
+    
+    std::cout << GridLogPerformance << "StaplePaddedAll timing:" << (t1-t0)/1000 << "ms" << std::endl;   
+  }
+
+   
  //////////////////////////////////////////////////
  // the sum over all staples on each site in direction mu,nu, upper part
  //////////////////////////////////////////////////
@@ -707,18 +907,14 @@ public:
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
-    U2 = U * Cshift(U, mu, 1);
+    U2 = U * Gimpl::CshiftLink(U, mu, 1); //two-link product U(x)*U(x+mu); CshiftLink presumably applies the Gimpl boundary condition to the shifted link - confirm
  }

  ////////////////////////////////////////////////////////////////////////////
-  // Hop by two optimisation strategy does not work nicely with Gparity. (could
-  // do,
-  // but need to track two deep where cross boundary and apply a conjugation).
-  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do
-  // so .
+  // Hop by two optimisation strategy. Use RectStapleDouble to obtain 'U2'
  ////////////////////////////////////////////////////////////////////////////
-  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
-                                  std::vector<GaugeMat> &U, int mu) {
+  static void RectStapleOptimised(GaugeMat &Stap, const std::vector<GaugeMat> &U2,
+                                  const std::vector<GaugeMat> &U, int mu) {

    Stap = Zero();

@@ -732,9 +928,9 @@ public:

        // Up staple    ___ ___
        //             |       |
-        tmp = Cshift(adj(U[nu]), nu, -1);
+        tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
        tmp = adj(U2[mu]) * tmp;
-        tmp = Cshift(tmp, mu, -2);
+        tmp = Gimpl::CshiftLink(tmp, mu, -2);

        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);

@@ -742,14 +938,14 @@ public:
        //             |___ ___|
        //
        tmp = adj(U2[mu]) * U[nu];
-        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));
+        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));

        //              ___ ___
        //             |    ___|
        //             |___ ___|
        //

-        Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
+        Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);

        //              ___ ___
        //             |___    |
@@ -758,7 +954,7 @@ public:

        //  tmp= Staple2x1* Cshift(U[mu],mu,-2);
        //  Stap+= Cshift(tmp,mu,1) ;
-        Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);
+        Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
        ;

        //       --
@@ -766,10 +962,10 @@ public:
        //
        //      |  |

-        tmp = Cshift(adj(U2[nu]), nu, -2);
+        tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
-        tmp = U2[nu] * Cshift(tmp, nu, 2);
-        Stap += Cshift(tmp, mu, 1);
+        tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);

        //      |  |
        //
@@ -778,25 +974,12 @@ public:

        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
        tmp = adj(U2[nu]) * tmp;
-        tmp = Cshift(tmp, nu, -2);
-        Stap += Cshift(tmp, mu, 1);
+        tmp = Gimpl::CshiftLink(tmp, nu, -2);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);
      }
    }
  }

-  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
-    RectStapleUnoptimised(Stap, Umu, mu);
-  }
-  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
-                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
-                         int mu) {
-    if (Gimpl::isPeriodicGaugeField()) {
-      RectStapleOptimised(Stap, U2, U, mu);
-    } else {
-      RectStapleUnoptimised(Stap, Umu, mu);
-    }
-  }
-
  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
                                    int mu) {
    GridBase *grid = Umu.Grid();
@@ -895,6 +1078,288 @@ public:
    }
  }

+  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) { //Stap: output rectangular staple for direction mu; Umu: full Lorentz gauge field
+    RectStapleUnoptimised(Stap, Umu, mu); //this overload always forwards to the unoptimised implementation
+  }
+  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap, //Umu is not read by this overload; Stap is the output staple
+                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U, //U2: per-direction output of RectStapleDouble; U: per-direction gauge links
+                         int mu) {
+    RectStapleOptimised(Stap, U2, U, mu); //forwarded unconditionally; no branch on the boundary-condition type here
+  }
+  //////////////////////////////////////////////////////
+  //Rectangular staples for every direction, built with the hop-by-two routines
+  //Stap : output, one rectangular staple field per direction (size Nd)
+  //U: input, one gauge link field per direction (size Nd)
+  /////////////////////////////////////////////////////
+  static void RectStapleAll(std::vector<GaugeMat> &Stap, const std::vector<GaugeMat> &U){
+    assert(Stap.size() == Nd); assert(U.size() == Nd);
+    std::vector<GaugeMat> doubled(Nd,U[0].Grid()); //doubled[dir] is filled by RectStapleDouble
+    for(int dir=0; dir<Nd; ++dir) RectStapleDouble(doubled[dir], U[dir], dir);
+    for(int dir=0; dir<Nd; ++dir) RectStapleOptimised(Stap[dir], doubled, U, dir); //separate loop: every entry of 'doubled' must exist first
+  }
+
+  //A workspace class that supplies the shift list (and padding depth) for the rectangular-staple stencil so the stencil can be reused
+  class RectStaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
+  public:
+    std::vector<Coordinate> getShifts() const override{ //returns 30 shifts per (mu,nu!=mu) pair, ordered mu-major then nu
+      std::vector<Coordinate> shifts;
+      for (int mu = 0; mu < Nd; mu++){
+	for (int nu = 0; nu < Nd; nu++) {
+	  if (nu != mu) {
+	    auto genShift = [&](int mushift,int nushift){ //displacement with only the mu and nu components non-zero
+	      Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
+	    };
+
+	    //Each group of five shifts below lists the link sites of the product in the comment, rightmost factor first
+	    //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+	    shifts.push_back(genShift(0,0));
+	    shifts.push_back(genShift(0,+1));
+	    shifts.push_back(genShift(+1,+1));
+	    shifts.push_back(genShift(+2,0));
+	    shifts.push_back(genShift(+1,0));
+
+	    //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+	    shifts.push_back(genShift(0,-1));
+	    shifts.push_back(genShift(0,-1));
+	    shifts.push_back(genShift(+1,-1));
+	    shifts.push_back(genShift(+2,-1));
+	    shifts.push_back(genShift(+1,0));
+
+	    //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+	    shifts.push_back(genShift(-1,0));
+	    shifts.push_back(genShift(-1,-1));
+	    shifts.push_back(genShift(-1,-1));
+	    shifts.push_back(genShift(0,-1));
+	    shifts.push_back(genShift(+1,-1));
+
+	    //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+	    shifts.push_back(genShift(-1,0));
+	    shifts.push_back(genShift(-1,0));
+	    shifts.push_back(genShift(-1,+1));
+	    shifts.push_back(genShift(0,+1));
+	    shifts.push_back(genShift(+1,0));
+
+	    //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+	    shifts.push_back(genShift(0,0));
+	    shifts.push_back(genShift(0,+1));
+	    shifts.push_back(genShift(0,+2));
+	    shifts.push_back(genShift(+1,+1));
+	    shifts.push_back(genShift(+1,0));
+
+	    //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+	    shifts.push_back(genShift(0,-1));
+	    shifts.push_back(genShift(0,-2));
+	    shifts.push_back(genShift(0,-2));
+	    shifts.push_back(genShift(+1,-2));
+	    shifts.push_back(genShift(+1,-1));
+	  }
+	}
+      }
+      return shifts;
+    }
+
+    int paddingDepth() const override{ return 2; } //the largest displacement component used above is +/-2
+  }; 
+
+  //Convenience overload of the padded-cell rectangular staple: all mu, summed over nu != mu
+  //staple: output staple for each mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class; a throwaway workspace derives the stencil from it on every call
+  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
+    RectStaplePaddedAllWorkspace scratch;
+    RectStaplePaddedAll(staple, U_padded, Cell, scratch.getStencil(Cell));
+  }
+  
+  //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
+  //staple: output staple for each mu, summed over nu != mu (Nd); each entry is assigned from Cell.Extract
+  //U_padded: the gauge link fields padded out using the PaddedCell class (Nd, on Cell.grids.back())
+  //Cell: the padded cell class; its depth must be at least 2
+  //gStencil: the stencil; its entries are read in the order mu, then nu != mu, then 30 entries per (mu,nu)
+  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
+    double t0 = usecond(); //start time, used only for the timing log at the end
+    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
+    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); //links must live on the last grid held by the cell
+    assert(Cell.depth >= 2);
+    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
+
+    size_t nshift = gStencil._npoints;
+    int mu_off_delta = nshift / Nd; //stencil entries per direction mu; assumes every mu contributes the same number of points
+    
+    //Open views to padded gauge links and keep open over mu loop
+    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+    size_t vsize = Nd*sizeof(GaugeViewType);
+    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); //host-side array of the Nd views; freed below
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
+    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize); //copy of the view array that the kernel indexes as Ug_dirs_v[dir][site]
+    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
+
+    GaugeMat gStaple(ggrid); //temp staple object on padded grid
+
+    int offset = 0; //index of the first stencil entry belonging to the current mu
+    for(int mu=0; mu<Nd; mu++){
+
+      { //view scope
+	autoView( gStaple_v , gStaple, AcceleratorWrite);
+	auto gStencil_v = gStencil.View();
+
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; //per-site accumulator for the sum over nu and the six link products
+	    stencil_ss = Zero();
+	    int s=offset; //running stencil entry index; each GetEntry(s++,ss) below consumes the next entry in order
+	    for(int nu=0;nu<Nd;nu++){
+	      if(nu != mu){
+		//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+		GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
+		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	    
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0; //U0 is the rightmost factor of the product in the comment above, U4 the leftmost
+
+		//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+		//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+	      }
+	    }
+	    coalescedWrite(gStaple_v[ss],stencil_ss);
+	  }
+	  );
+	offset += mu_off_delta; //advance to the first stencil entry of the next mu
+      }//kernel/view scope
+
+      staple[mu] = Cell.Extract(gStaple);    //result of Cell.Extract on the padded temporary becomes the output for this mu
+    }//mu loop
+  
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose(); //close the views opened before the mu loop
+    free(Ug_dirs_v_host);
+    acceleratorFreeDevice(Ug_dirs_v);
+    
+    double t1 = usecond();
+    
+    std::cout << GridLogPerformance << "RectStaplePaddedAll timings:" << (t1-t0)/1000 << "ms" << std::endl;   
+  }
+
+  //A workspace for reusing the PaddedCell and GeneralLocalStencil objects across calls to StapleAndRectStapleAll
+  class StapleAndRectStapleAllWorkspace: public WilsonLoopPaddedWorkspace{
+  public:
+    StapleAndRectStapleAllWorkspace(){
+      this->addStencil(new StaplePaddedAllWorkspace); //registered first: StapleAndRectStapleAll fetches it with getStencil(0,...)
+      this->addStencil(new RectStaplePaddedAllWorkspace); //registered second: fetched with getStencil(1,...); presumably the base class takes ownership of both pointers - confirm
+    }
+  };     
+    
+  //////////////////////////////////////////////////////
+  //1x1 and 1x2 staples for all orientations, using a workspace that lives only for this call
+  //Stap : output array of staples (Nd)
+  //RectStap: output array of rectangular staples (Nd)
+  //U: Gauge links in each direction (Nd)
+  /////////////////////////////////////////////////////
+  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U){
+    StapleAndRectStapleAllWorkspace scratch;
+    StapleAndRectStapleAll(Stap, RectStap, U, scratch);
+  }
+  
+  //////////////////////////////////////////////////////
+  //Compute the 1x1 and 1x2 staples for all orientations
+  //Stap : Array of staples (Nd), output
+  //RectStap: Array of rectangular staples (Nd), output
+  //U: Gauge links in each direction (Nd); U[0].Grid() must be a GridCartesian
+  //wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse
+  /////////////////////////////////////////////////////
+  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U, StapleAndRectStapleAllWorkspace &wk){
+#if 0
+    StapleAll(Stap, U);
+    RectStapleAll(RectStap, U);
+#else
+    double t0 = usecond();
+    assert(U.size() == Nd); //U[0] and U[mu] for all mu<Nd are indexed below
+    GridCartesian* unpadded_grid = dynamic_cast<GridCartesian*>(U[0].Grid());
+    assert(unpadded_grid != nullptr); //dynamic_cast yields nullptr if the grid is not a GridCartesian; fail here rather than inside the workspace
+    const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid);
+    CshiftImplGauge<Gimpl> cshift_impl;
+    std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
+    for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl); //pad each link field once; both staple kernels reuse U_pad
+    double t1 = usecond();
+    StaplePaddedAll(Stap, U_pad, Ghost, wk.getStencil(0,unpadded_grid) ); //index 0: first stencil registered by the workspace constructor
+    double t2 = usecond();
+    RectStaplePaddedAll(RectStap, U_pad, Ghost, wk.getStencil(1,unpadded_grid)); //index 1: second stencil registered by the workspace constructor
+    double t3 = usecond();
+    std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
+#endif
+  }
+
  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -79,60 +79,60 @@ public:
    this->_entries.resize(npoints* osites);
    this->_entries_p = &_entries[0];

+    thread_for(site, osites, {
+	Coordinate Coor;
+	Coordinate NbrCoor;

-    Coordinate Coor;
-    Coordinate NbrCoor;
-    for(Integer site=0;site<osites;site++){
-      for(Integer ii=0;ii<npoints;ii++){
-	Integer lex = site*npoints+ii;
-	GeneralStencilEntry SE;
-	////////////////////////////////////////////////
-	// Outer index of neighbour Offset calculation
-	////////////////////////////////////////////////
-	grid->oCoorFromOindex(Coor,site);
-	for(int d=0;d<Coor.size();d++){
-	  int rd = grid->_rdimensions[d];
-	  NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
+	for(Integer ii=0;ii<npoints;ii++){
+	  Integer lex = site*npoints+ii;
+	  GeneralStencilEntry SE;
+	  ////////////////////////////////////////////////
+	  // Outer index of neighbour Offset calculation
+	  ////////////////////////////////////////////////
+	  grid->oCoorFromOindex(Coor,site);
+	  for(int d=0;d<Coor.size();d++){
+	    int rd = grid->_rdimensions[d];
+	    NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
+	  }
+	  SE._offset      = grid->oIndexReduced(NbrCoor);
+
+	  ////////////////////////////////////////////////
+	  // Inner index permute calculation
+	  // Simpler version using icoor calculation
+	  ////////////////////////////////////////////////
+	  SE._permute =0;
+	  for(int d=0;d<Coor.size();d++){
+
+	    int fd = grid->_fdimensions[d];
+	    int rd = grid->_rdimensions[d];
+	    int ly = grid->_simd_layout[d];
+
+	    assert((ly==1)||(ly==2));
+
+	    int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
+	    int x = Coor[d];                // x in [0... rd-1] as an oSite 
+
+	    int permute_dim  = grid->PermuteDim(d);
+	    int permute_slice=0;
+	    if(permute_dim){    
+	      int  num = shift%rd; // Slice within dest osite cell of slice zero
+	      int wrap = shift/rd; // Number of osite local volume cells crossed through
+	      // x+num < rd dictates whether we are in same permute state as slice 0
+	      if ( x< rd-num ) permute_slice=wrap;
+	      else             permute_slice=(wrap+1)%ly;
+	    }
+	    if ( permute_slice ) {
+	      int ptype       =grid->PermuteType(d);
+	      uint8_t mask    =0x1<<ptype;
+	      SE._permute    |= mask;
+	    }
+	  }	
+	  ////////////////////////////////////////////////
+	  // Store in look up table
+	  ////////////////////////////////////////////////
+	  this->_entries[lex] = SE;
 	}
-	SE._offset      = grid->oIndexReduced(NbrCoor);
-
-	////////////////////////////////////////////////
-	// Inner index permute calculation
-	// Simpler version using icoor calculation
-	////////////////////////////////////////////////
-	SE._permute =0;
-	for(int d=0;d<Coor.size();d++){
-
-	  int fd = grid->_fdimensions[d];
-	  int rd = grid->_rdimensions[d];
-	  int ly = grid->_simd_layout[d];
-
-	  assert((ly==1)||(ly==2));
-
-	  int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
-	  int x = Coor[d];                // x in [0... rd-1] as an oSite 
-
-	  int permute_dim  = grid->PermuteDim(d);
-	  int permute_slice=0;
-	  if(permute_dim){    
-	    int  num = shift%rd; // Slice within dest osite cell of slice zero
-	    int wrap = shift/rd; // Number of osite local volume cells crossed through
-                                  // x+num < rd dictates whether we are in same permute state as slice 0
-	    if ( x< rd-num ) permute_slice=wrap;
-	    else             permute_slice=(wrap+1)%ly;
-	  }
-	  if ( permute_slice ) {
-	    int ptype       =grid->PermuteType(d);
-	    uint8_t mask    =0x1<<ptype;
-	    SE._permute    |= mask;
-	  }
-	}	
-	////////////////////////////////////////////////
-	// Store in look up table
-	////////////////////////////////////////////////
-	this->_entries[lex] = SE;
-      }
-    }      
+      });
  }
  
 };
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -32,6 +32,7 @@

 #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate
 #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
+#include <Grid/stencil/GeneralLocalStencil.h>

 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -73,6 +73,16 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
    return vec;
  }
 }
+//'perm_mask' is a bitmask: for each set bit d (d < nd) permute(.,.,d) is applied, lowest bit first; 'lane' is unused here
+template<class vobj> accelerator_inline
+vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
+{
+  vobj result = vec, source = vec;
+  for (int ptype=0; ptype<nd; ++ptype)
+    if (perm_mask & (0x1 << ptype)) { permute(result,source,ptype); source = result; }
+  return result;
+}
+
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 {
@@ -83,7 +93,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 {
  vstream(vec, extracted);
 }
-#else
+#else //==GRID_SIMT


 //#ifndef GRID_SYCL
@@ -166,6 +176,14 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,
  return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
+typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
+{
+  int plane = lane; //lane index to extract; starts as the caller's lane
+  for (int d=0;d<nd;d++) //for each set bit d of perm_mask, XOR the lane index with Nsimd>>(d+1)
+    plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane;
+  return extractLane(plane,vec);
+}
+template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  insertLane(lane,vec,extracted);
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@@ -1,4 +1,4 @@
 BREW=/opt/local/
-CXX=mpicxx-openmpi-mp ../../configure --enable-simd=GEN --enable-comms=mpi --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
+MPICXX=mpicxx CXX=c++-12 ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug


--- a/tests/debug/Test_iwasaki_action_newstaple.cc
+++ b/tests/debug/Test_iwasaki_action_newstaple.cc
@@ -0,0 +1,188 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_iwasaki_action_newstaple.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+////////////////////////////////////////////////////////////////////////
+// PlaqPlusRectangleAction
+////////////////////////////////////////////////////////////////////////
+template<class Gimpl>
+class PlaqPlusRectangleActionOrig : public Action<typename Gimpl::GaugeField> { //local copy of the plaquette+rectangle action used as the comparison baseline in main()
+public:
+
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+private:
+  RealD c_plaq; //coefficient multiplying the plaquette term
+  RealD c_rect; //coefficient multiplying the rectangle term
+
+public:
+  PlaqPlusRectangleActionOrig(RealD b,RealD c): c_plaq(b),c_rect(c){}; //b -> c_plaq, c -> c_rect
+
+  virtual std::string action_name(){return "PlaqPlusRectangleActionOrig";}
+      
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
+      
+  virtual std::string LogParameters(){ //returns the two coefficients formatted as log lines
+    std::stringstream sstream;
+    sstream << GridLogMessage << "["<<action_name() <<"] c_plaq: " << c_plaq << std::endl;
+    sstream << GridLogMessage << "["<<action_name() <<"] c_rect: " << c_rect << std::endl;
+    return sstream.str();
+  }
+
+
+  virtual RealD S(const GaugeField &U) { //action value built from the average plaquette and average rectangle
+    RealD vol = U.Grid()->gSites();
+
+    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+    RealD rect = WilsonLoops<Gimpl>::avgRectangle(U);
+
+    RealD action=c_plaq*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5 //plaquette term is weighted by Nd*(Nd-1)/2 per site
+      +c_rect*(1.0 -rect)*(Nd*(Nd-1.0))*vol; //rectangle term is weighted by Nd*(Nd-1) per site
+
+    return action;
+  };
+
+  virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) { //writes the derivative for every direction mu into dSdU
+    //extend Ta to include Lorentz indexes
+    RealD factor_p = c_plaq/RealD(Nc)*0.5;
+    RealD factor_r = c_rect/RealD(Nc)*0.5;
+
+    GridBase *grid = Umu.Grid();
+
+    std::vector<GaugeLinkField> U (Nd,grid); //links split out per direction
+    std::vector<GaugeLinkField> U2(Nd,grid); //per-direction output of RectStapleDouble
+
+    for(int mu=0;mu<Nd;mu++){
+      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+      WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
+    }
+
+    GaugeLinkField dSdU_mu(grid);
+    GaugeLinkField staple(grid); //scratch: holds the plaquette staple first, then is overwritten with the rectangle staple
+
+    for (int mu=0; mu < Nd; mu++){
+
+      // Staple in direction mu
+
+      WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
+
+      dSdU_mu = Ta(U[mu]*staple)*factor_p;
+
+      WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
+
+      dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
+	  
+      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
+    }
+
+  };
+
+};
+
+// Convenience for common physically defined cases.
+//
+// RBC c1 parameterisation is not really RBC but don't have good
+// reference and we are happy to change name if prior use of this plaq coeff
+// parameterisation is made known to us. 
+template<class Gimpl>
+class RBCGaugeActionOrig : public PlaqPlusRectangleActionOrig<Gimpl> {
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+  RBCGaugeActionOrig(RealD beta,RealD c1) : PlaqPlusRectangleActionOrig<Gimpl>(beta*(1.0-8.0*c1), beta*c1) {}; //c_plaq = beta*(1-8*c1), c_rect = beta*c1
+  virtual std::string action_name(){return "RBCGaugeActionOrig";}
+};
+
+template<class Gimpl>
+class IwasakiGaugeActionOrig : public RBCGaugeActionOrig<Gimpl> {
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+  IwasakiGaugeActionOrig(RealD beta) : RBCGaugeActionOrig<Gimpl>(beta,-0.331) {}; //fixes c1 = -0.331 in the RBC parameterisation
+  virtual std::string action_name(){return "IwasakiGaugeActionOrig";}
+};
+
+
+int main (int argc, char ** argv) //compares deriv() of the local "Orig" Iwasaki action against IwasakiGaugeAction and reports timings
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size  = GridDefaultLatt();
+  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
+  std::cout << " mpi "<<mpi_layout<<std::endl;
+  std::cout << " simd "<<simd_layout<<std::endl;
+  std::cout << " latt "<<latt_size<<std::endl;
+  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
+
+  GridParallelRNG   pRNG(&GRID);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); //fixed seeds so every run uses the same gauge field
+  LatticeGaugeField U(&GRID);
+
+  SU<Nc>::HotConfiguration(pRNG,U);
+
+  //#define PRD  (uncomment to select PeriodicGimplD below)
+#ifdef PRD
+  typedef PeriodicGimplD Gimpl;
+#else
+  typedef ConjugateGimplD Gimpl;
+  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; //directions 0 and 3 are flagged
+  Gimpl::setDirections(conj_dirs);
+#endif
+
+  typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
+  typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;
+
+  GaugeLorentz derivOrig(&GRID), derivNew(&GRID);
+  double beta = 2.13;
+  IwasakiGaugeActionOrig<Gimpl> action_orig(beta); //baseline defined in this file
+  IwasakiGaugeAction<Gimpl> action_new(beta); //library implementation under test
+
+  double torig=0, tnew=0; //accumulated times in ms
+  int ntest = 10;
+  for(int i=0;i<ntest;i++){
+    double t0 = usecond();
+    action_orig.deriv(U, derivOrig);
+    double t1 = usecond();
+    action_new.deriv(U, derivNew);
+    double t2 = usecond();
+
+    GaugeLorentz diff = derivOrig - derivNew;
+    double n = norm2(diff);
+    std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
+    assert(n<1e-10); //note: this check is compiled out if NDEBUG is defined
+
+    std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms,  new: " << (t2-t1)/1000 << "ms" << std::endl;
+    torig += (t1-t0)/1000; tnew += (t2-t1)/1000;
+  }
+  std::cout << GridLogMessage << "Avg timings " << ntest << " iterations: orig:" << torig/ntest << "ms,   new:" << tnew/ntest << "ms" << std::endl;
+  
+  Grid_finalize();
+}
--- a/tests/debug/Test_optimized_staple_gaugebc.cc
+++ b/tests/debug/Test_optimized_staple_gaugebc.cc
@@ -0,0 +1,94 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_optimized_staple_gaugebc.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+using namespace std;
+using namespace Grid;
+ 
+int main (int argc, char ** argv) //checks RectStapleOptimised against RectStapleUnoptimised for every direction and reports timings
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size  = GridDefaultLatt();
+  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
+  std::cout << " mpi "<<mpi_layout<<std::endl;
+  std::cout << " simd "<<simd_layout<<std::endl;
+  std::cout << " latt "<<latt_size<<std::endl;
+  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
+
+  GridParallelRNG   pRNG(&GRID);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); //fixed seeds so every run uses the same gauge field
+  LatticeGaugeField U(&GRID);
+
+  SU<Nc>::HotConfiguration(pRNG,U);
+
+  //#define PRD
+#ifdef PRD
+  typedef PeriodicGimplD Gimpl;
+#else
+  typedef ConjugateGimplD Gimpl;
+  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; //directions 0 and 3 are flagged
+  Gimpl::setDirections(conj_dirs);
+#endif
+
+  typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
+  typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;
+
+  int count = 0;
+  double torig=0, topt=0; //accumulated times in microseconds
+     
+  std::vector<GaugeMat> Umu(Nd,&GRID), U2(Nd,&GRID);
+  for(int mu=0;mu<Nd;mu++){
+    Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
+    WilsonLoops<Gimpl>::RectStapleDouble(U2[mu], Umu[mu], mu); //U2 input required by RectStapleOptimised
+  }
+
+  std::cout << GridLogMessage << "Checking optimized vs unoptimized RectStaple" << std::endl;
+  for(int mu=0;mu<Nd;mu++){
+    GaugeMat staple_orig(&GRID), staple_opt(&GRID); //the unused third lattice (staple_U2) was dropped
+    double t0 = usecond();
+    WilsonLoops<Gimpl>::RectStapleUnoptimised(staple_orig,U,mu);
+    double t1 = usecond();
+    WilsonLoops<Gimpl>::RectStapleOptimised(staple_opt, U2, Umu, mu);
+    double t2 = usecond();
+    torig += t1-t0;  topt += t2-t1;
+    ++count;
+    
+    GaugeMat diff = staple_orig - staple_opt;
+    double n = norm2(diff);
+    std::cout << GridLogMessage << mu << " " << n << std::endl;
+    assert(n<1e-10); //note: this check is compiled out if NDEBUG is defined
+  }
+  std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  optimized: " << topt/1000/count << "ms" << std::endl;
+  
+  Grid_finalize();
+}
--- a/tests/debug/Test_padded_cell_staple.cc
+++ b/tests/debug/Test_padded_cell_staple.cc
@@ -0,0 +1,580 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_padded_cell_staple.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+using namespace std;
+using namespace Grid;
+
+template <class Gimpl> class WilsonLoopsTest : public Gimpl { // Test-only helper: Cshift-based reference staples plus padded-cell/stencil versions that main() compares
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+  // GaugeMat is the per-direction link field (what PeekIndex<LorentzIndex> is assigned to below); GaugeLorentz is the full gauge field
+  typedef typename Gimpl::GaugeLinkField GaugeMat;
+  typedef typename Gimpl::GaugeField GaugeLorentz;
+  // Reference ("original") staple built from the Gimpl covariant-shift helpers.
+  // Overwrites staple with the sum of the two three-link terms drawn below for link direction mu in the (mu,nu) plane;
+  // staple is left at zero when nu == mu. Umu is the full gauge field, from which all Nd link fields are peeked.
+  static void StapleOrig(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
+			 int nu) {
+
+    GridBase *grid = Umu.Grid();
+
+    std::vector<GaugeMat> U(Nd, grid);
+    for (int d = 0; d < Nd; d++) {
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
+    }
+    staple = Zero();
+
+    if (nu != mu) {
+
+      // mu
+      // ^
+      // |__>  nu
+
+      //    __
+      //      |
+      //    __|
+      //
+
+      //Forward: Out(x) = Link(x)*field(x+mu)
+      //Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
+      //ShiftStaple: Link(x) = Link(x+mu)
+
+      //tmp1 = U^dag_nu(x-nu)
+      //tmp2 = U^dag_mu(x-mu) tmp1(x-mu) = U^dag_mu(x-mu) U^dag_nu(x-nu-mu)
+      //tmp3 = U_nu(x) tmp2(x+nu) = U_nu(x)U^dag_mu(x-mu+nu) U^dag_nu(x-mu)
+      //tmp4 = tmp3(x+mu) = U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
+
+      staple += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftBackward(
+										  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+				   mu);
+
+      //  __
+      // |
+      // |__
+      //
+      //
+    
+      //tmp1 = U_mu^dag(x-mu) U_nu(x-mu)
+      //tmp2 = U_nu^dag(x-nu) tmp1(x-nu) = U_nu^dag(x-nu) U_mu^dag(x-mu-nu) U_nu(x-mu-nu)
+      //tmp3 = tmp2(x+mu) = U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
+      staple += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftBackward(U[nu], nu,
+							   Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+				   mu);
+    }
+  }
+  // Padded-cell version of StapleOrig: exchanges U_mu and U_nu onto a padded grid, evaluates both three-link terms in one stencil kernel, extracts the result into staple (zero if nu==mu) and prints a per-phase timing line.
+  static void StaplePadded(GaugeMat &staple, const GaugeLorentz &U, int mu,
+			   int nu) {
+    if(nu==mu){
+      staple = Zero();
+      return;
+    }
+    double peek = 0, construct = 0, exchange = 0, coord = 0, stencil =0, kernel = 0, extract = 0, total = 0;
+    // Each timer accumulates differences of usecond() readings and is printed unscaled at the end of the function
+    double tstart = usecond();
+    double t=tstart;
+    
+    PaddedCell Ghost(1, (GridCartesian*)U.Grid()); // first argument is presumably the padding depth (TODO confirm); every shift used below moves at most one site per direction
+
+    construct += usecond() - t;
+      
+    t=usecond();      
+    GaugeMat U_mu = PeekIndex<LorentzIndex>(U, mu);
+    GaugeMat U_nu = PeekIndex<LorentzIndex>(U, nu);
+    peek += usecond() - t;
+
+    t=usecond();
+    CshiftImplGauge<Gimpl> cshift_impl;
+    GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl);
+    GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl);
+    exchange += usecond() - t;
+    
+    GridBase *ggrid = Ug_mu.Grid(); // grid of the exchanged (padded) fields; the stencil and the output below live on it
+
+    GaugeMat gStaple(ggrid);
+
+    t=usecond();
+    Coordinate shift_0(Nd,0);
+    Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
+    Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
+    Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
+    Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
+
+    std::vector<Coordinate> shifts;
+    // Push order defines the stencil entry index: the kernel below reads entries 0-2 for the first term and 3-5 for the second, rightmost link first
+    //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
+    shifts.push_back(shift_0);
+    shifts.push_back(shift_nu);
+    shifts.push_back(shift_mu);
+
+    //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
+    shifts.push_back(shift_mnu);
+    shifts.push_back(shift_mnu);
+    shifts.push_back(shift_mnu_pmu);
+    coord += usecond()-t;
+
+    t=usecond();
+    GeneralLocalStencil gStencil(ggrid,shifts);
+    stencil += usecond() -t;
+
+    t=usecond();
+    {
+      autoView( gStaple_v , gStaple, AcceleratorWrite);
+      auto gStencil_v = gStencil.View();
+      autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
+      autoView( Ug_nu_v , Ug_nu, AcceleratorRead);
+      // The loop runs over every outer site of the padded grid; each site gathers its six links via the stencil entries
+      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	  GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss);
+	  auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
+	  e = gStencil_v.GetEntry(1,ss);
+	  auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
+	  e = gStencil_v.GetEntry(2,ss);
+	  auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
+      
+	  auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x;
+
+	  e = gStencil_v.GetEntry(3,ss);
+	  auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
+	  e = gStencil_v.GetEntry(4,ss);
+	  auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
+	  e = gStencil_v.GetEntry(5,ss);
+	  auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
+
+	  stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu;
+      
+	  coalescedWrite(gStaple_v[ss],stencil_ss);
+	}
+	);
+    } //ensure views are all closed!
+    kernel += usecond() - t;
+
+    t=usecond();
+    staple = Ghost.Extract(gStaple);
+    extract += usecond()-t;
+    
+    total += usecond() - tstart;
+    std::cout << GridLogMessage << "StaplePadded timings peek:" << peek << " construct:" << construct << " exchange:" << exchange << " coord:" << coord << " stencil:" << stencil << " kernel:" << kernel << " extract:" << extract << " total:" << total << std::endl;
+  }
+  // Reference rectangle staple for link direction mu: overwrites Stap with the sum, over every nu != mu, of the six five-link terms drawn below, built from the Gimpl covariant-shift helpers.
+  static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu,
+			     int mu) {
+    GridBase *grid = Umu.Grid();
+
+    std::vector<GaugeMat> U(Nd, grid);
+    for (int d = 0; d < Nd; d++) {
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
+    }
+
+    Stap = Zero();
+
+    for (int nu = 0; nu < Nd; nu++) {
+      if (nu != mu) {
+        //           __ ___
+        //          |    __ |
+        //
+	//tmp1 = U_nu^dag(x-nu)
+	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu) U_nu^dag(x-nu-mu)
+	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu) U_mu^dag(x-2mu) U_nu^dag(x-nu-2mu)
+	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu) U_mu^dag(x-2mu+nu) U_nu^dag(x-2mu)
+	//tmp5 = U_mu(x)tmp4(x+mu) = U_mu(x)U_nu(x+mu)U_mu^dag(x+nu) U_mu^dag(x-mu+nu) U_nu^dag(x-mu)
+	//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+	
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[mu], mu,
+							  Gimpl::CovShiftForward(
+										 U[nu], nu,
+										 Gimpl::CovShiftBackward(
+													 U[mu], mu,
+													 Gimpl::CovShiftBackward(
+																 U[mu], mu,
+																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+				   mu);
+
+        //              __
+        //          |__ __ |
+
+	//tmp1 = U^dag_mu(x-mu)U_nu(x-mu)
+	//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)
+	//tmp3 = U^dag_nu(x-nu)tmp2(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)
+	//tmp4 = U_mu(x)tmp3(x+mu) = U_mu(x)U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)
+	//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+	
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[mu], mu,
+							  Gimpl::CovShiftBackward(
+										  U[nu], nu,
+										  Gimpl::CovShiftBackward(
+													  U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
+				   mu);
+
+        //           __
+        //          |__ __ |
+	//Forward: Out(x) = Link(x)*field(x+mu)
+	//Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
+	//ShiftStaple: Link(x) = Link(x+mu)
+
+	//tmp1 = U_nu(x)U_mu(x+nu)
+	//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U_nu(x-mu)U_mu(x+nu-mu)
+	//tmp3 = U^dag_mu(x-mu)tmp2(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)U_mu(x+nu-2mu)
+	//tmp4 = U^dag_nu(x-nu)tmp3(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)U_mu(x-2mu)
+	//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftBackward(
+							   U[nu], nu,
+							   Gimpl::CovShiftBackward(
+										   U[mu], mu,
+										   Gimpl::CovShiftBackward(
+													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
+				   mu);
+
+        //           __ ___
+        //          |__    |
+	//tmp1 = U_nu^dag(x-nu)U_mu(x-nu)
+	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_mu(x-mu-nu)
+	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_mu^dag(x-2mu)U_nu^dag(x-2mu-nu)U_mu(x-2mu-nu)
+	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_mu^dag(x-2mu+nu)U_nu^dag(x-2mu)U_mu(x-2mu)
+	//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftBackward(
+										  U[mu], mu,
+										  Gimpl::CovShiftBackward(
+													  U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
+				   mu);
+
+        //       --
+        //      |  |
+        //
+        //      |  |
+	//tmp1 = U_nu^dag(x-nu)
+	//tmp2 = U_nu^dag(x-nu)tmp1(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)
+	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_nu^dag(x-mu-2nu)
+	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_nu^dag(x-mu-nu)
+	//tmp5 = U_nu(x)tmp4(x+nu) = U_nu(x)U_nu(x+nu)U_mu^dag(x-mu+2nu)U_nu^dag(x-mu+nu)U_nu^dag(x-mu)
+	//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftForward(
+										 U[nu], nu,
+										 Gimpl::CovShiftBackward(
+													 U[mu], mu,
+													 Gimpl::CovShiftBackward(
+																 U[nu], nu,
+																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+				   mu);
+
+        //      |  |
+        //
+        //      |  |
+        //       --
+	//tmp1 = U_nu(x)U_nu(x+nu)
+	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu(x-mu)U_nu(x-mu+nu)
+	//tmp3 = U_nu^dag(x-nu)tmp2(x-nu) = U_nu^dag(x-nu)U_mu^dag(x-mu-nu)U_nu(x-mu-nu)U_nu(x-mu)
+	//tmp4 = U_nu^dag(x-nu)tmp3(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)U_mu^dag(x-mu-2nu)U_nu(x-mu-2nu)U_nu(x-mu-nu)
+	//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftBackward(
+							   U[nu], nu,
+							   Gimpl::CovShiftBackward(
+										   U[nu], nu,
+										   Gimpl::CovShiftBackward(
+													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
+				   mu);
+      }
+    }
+  }
+  // Padded-cell version of RectStapleOrig: exchanges all Nd link fields onto a padded grid, evaluates the six five-link terms for every
+  // nu != mu in a single stencil kernel (final expressions copied from the tmp derivations in RectStapleOrig), then extracts the result into Stap.
+  static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U,
+			       int mu) {
+    PaddedCell Ghost(2,(GridCartesian*)U.Grid()); // first argument is presumably the padding depth (TODO confirm); the shifts below reach up to two sites
+    GridBase *ggrid = Ghost.grids.back();
+    // Exchange every direction's link field; the results live on ggrid
+    CshiftImplGauge<Gimpl> cshift_impl;
+    std::vector<GaugeMat> Ug_dirs(Nd,ggrid);
+    for(int i=0;i<Nd;i++) Ug_dirs[i] = Ghost.Exchange(PeekIndex<LorentzIndex>(U, i), cshift_impl);
+
+    GaugeMat gStaple(ggrid);
+
+    std::vector<Coordinate> shifts;
+    for (int nu = 0; nu < Nd; nu++) {
+      if (nu != mu) {
+	auto genShift = [&](int mushift,int nushift){
+	  Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
+	};
+	// Five shifts per term, pushed rightmost link first; the kernel below consumes them in exactly this order via GetEntry(s++)
+	//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+	shifts.push_back(genShift(0,0));
+	shifts.push_back(genShift(0,+1));
+	shifts.push_back(genShift(+1,+1));
+	shifts.push_back(genShift(+2,0));
+	shifts.push_back(genShift(+1,0));
+
+	//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+	shifts.push_back(genShift(0,-1));
+	shifts.push_back(genShift(0,-1));
+	shifts.push_back(genShift(+1,-1));
+	shifts.push_back(genShift(+2,-1));
+	shifts.push_back(genShift(+1,0));
+
+	//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+	shifts.push_back(genShift(-1,0));
+	shifts.push_back(genShift(-1,-1));
+	shifts.push_back(genShift(-1,-1));
+	shifts.push_back(genShift(0,-1));
+	shifts.push_back(genShift(+1,-1));
+
+	//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+	shifts.push_back(genShift(-1,0));
+	shifts.push_back(genShift(-1,0));
+	shifts.push_back(genShift(-1,+1));
+	shifts.push_back(genShift(0,+1));
+	shifts.push_back(genShift(+1,0));
+
+	//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+	shifts.push_back(genShift(0,0));
+	shifts.push_back(genShift(0,+1));
+	shifts.push_back(genShift(0,+2));
+	shifts.push_back(genShift(+1,+1));
+	shifts.push_back(genShift(+1,0));
+
+	//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+	shifts.push_back(genShift(0,-1));
+	shifts.push_back(genShift(0,-2));
+	shifts.push_back(genShift(0,-2));
+	shifts.push_back(genShift(+1,-2));
+	shifts.push_back(genShift(+1,-1));
+      }
+    }
+    size_t nshift = shifts.size();
+    // nshift is 30 per nu != mu: six terms of five shifts each
+    GeneralLocalStencil gStencil(ggrid,shifts);
+    {
+      autoView( gStaple_v , gStaple, AcceleratorWrite);
+      auto gStencil_v = gStencil.View();
+      // Views of all Nd exchanged link fields are collected in a host array and copied to device memory so the kernel can select a field by direction index
+      typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+      size_t vsize = Nd*sizeof(GaugeViewType);
+      GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
+      for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = Ug_dirs[i].View(AcceleratorRead);
+      GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
+      acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
+
+      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	  decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
+	  stencil_ss = Zero();
+	  int s=0;
+	  for(int nu=0;nu<Nd;nu++){
+	    if(nu != mu){
+	      //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+	      GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
+	      auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	    
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+	      //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+	      //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+	      //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+	      //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+	      //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+	    }
+	  }
+	  assert(s==nshift); // sanity check: every stencil entry generated above was consumed exactly once
+	  coalescedWrite(gStaple_v[ss],stencil_ss);
+	}
+	);
+      // Close the views opened above, then release the host and device view arrays
+      for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
+      free(Ug_dirs_v_host);
+      acceleratorFreeDevice(Ug_dirs_v);
+    }   
+    Stap = Ghost.Extract(gStaple);    
+  }
+
+
+
+};  
+  
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // Test driver: on a hot-start gauge field, checks that the padded-cell Staple and RectStaple agree with the Cshift-based originals (norm2 of the difference below 1e-10) and reports average timings.
+  Coordinate latt_size  = GridDefaultLatt();
+  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
+  std::cout << " mpi "<<mpi_layout<<std::endl;
+  std::cout << " simd "<<simd_layout<<std::endl;
+  std::cout << " latt "<<latt_size<<std::endl;
+  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
+
+  GridParallelRNG   pRNG(&GRID);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  LatticeGaugeField U(&GRID);
+
+  SU<Nc>::HotConfiguration(pRNG,U);
+  // The test runs with ConjugateGimplD, flagging directions 0 and 3 via setDirections; the periodic alternative is left commented out
+  //typedef PeriodicGimplD Gimpl;
+  typedef ConjugateGimplD Gimpl;
+  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
+  Gimpl::setDirections(conj_dirs);
+
+  typedef typename WilsonLoopsTest<Gimpl>::GaugeMat GaugeMat;
+  typedef typename WilsonLoopsTest<Gimpl>::GaugeLorentz GaugeLorentz;
+  // Part 1: three-link staple, checked for every ordered pair mu != nu
+  std::cout << GridLogMessage << "Checking Staple" << std::endl;
+  int count = 0;
+  double torig=0, tpadded=0;
+  
+  for(int mu=0;mu<Nd;mu++){
+    for(int nu=0;nu<Nd;nu++){
+      if(mu != nu){
+	GaugeMat staple_orig(&GRID), staple_padded(&GRID);
+	double t0 = usecond();
+	WilsonLoopsTest<Gimpl>::StapleOrig(staple_orig,U,mu,nu);
+	double t1 = usecond();
+	WilsonLoopsTest<Gimpl>::StaplePadded(staple_padded,U,mu,nu);
+	double t2 = usecond();
+	torig += t1-t0;  tpadded += t2-t1;
+	++count;
+	
+	GaugeMat diff = staple_orig - staple_padded;
+	double n = norm2(diff);
+	std::cout << GridLogMessage << mu << " " << nu << " " << n << std::endl;
+	assert(n<1e-10); // NOTE: assert is compiled out when NDEBUG is defined; in that case a mismatch is only visible in the printed norm
+      }
+    }
+  }
+  std::cout << GridLogMessage << "Staple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl;
+  count=0; torig=tpadded=0;
+    // Part 2: rectangle staple, one check per direction mu (accumulators were reset on the line above)
+  std::cout << GridLogMessage << "Checking RectStaple" << std::endl;
+  for(int mu=0;mu<Nd;mu++){
+    GaugeMat staple_orig(&GRID), staple_padded(&GRID);
+    double t0 = usecond();
+    WilsonLoopsTest<Gimpl>::RectStapleOrig(staple_orig,U,mu);
+    double t1 = usecond();
+    WilsonLoopsTest<Gimpl>::RectStaplePadded(staple_padded,U,mu);
+    double t2 = usecond();
+    torig += t1-t0;  tpadded += t2-t1;
+    ++count;
+    
+    GaugeMat diff = staple_orig - staple_padded;
+    double n = norm2(diff);
+    std::cout << GridLogMessage << mu << " " << n << std::endl;
+    assert(n<1e-10); // NOTE: same NDEBUG caveat as in the staple check
+  }
+  std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl; // usecond() differences divided by 1000 and by the number of checks, labelled ms
+  
+  Grid_finalize();
+}
--- a/tests/qdpxx/Test_qdpxx_munprec.cc
+++ b/tests/qdpxx/Test_qdpxx_munprec.cc
@@ -1,6 +1,7 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
+
    Source file: ./tests/qdpxx/Test_qdpxx_munprec.cc

    Copyright (C) 2015
@@ -25,17 +26,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <chroma.h>
-#include <actions/ferm/invert/syssolver_linop_cg_array.h>
-#include <actions/ferm/invert/syssolver_linop_aggregate.h>
-
 #include <Grid/Grid.h>

 int    Ls=8;
 double M5=1.6;
 double mq=0.01;
-double zolo_lo = 0.01;
-double zolo_hi = 7.0;
+double zolo_lo = 0.1;
+double zolo_hi = 2.0;
 double mobius_scale=2.0;

 enum ChromaAction {
@@ -58,6 +55,11 @@ enum ChromaAction {
 void calc_grid      (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag);
 void calc_chroma    (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag);

+#include <chroma.h>
+#include <actions/ferm/invert/syssolver_linop_cg_array.h>
+#include <actions/ferm/invert/syssolver_linop_aggregate.h>
+
+

 namespace Chroma { 

@@ -79,7 +81,7 @@ public:

    std::vector<int> x(4);
    QDP::multi1d<int> cx(4);
-    Grid::Coordinate gd = gr.Grid()->GlobalDimensions();
+    std::vector<int> gd= gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -122,7 +124,7 @@ public:

    std::vector<int> x(5);
    QDP::multi1d<int> cx(4);
-    Grid::Coordinate gd= gr.Grid()->GlobalDimensions();
+    std::vector<int> gd= gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -164,7 +166,7 @@ public:

    std::vector<int> x(5);
    QDP::multi1d<int> cx(4);
-    Grid::Coordinate gd= gr.Grid()->GlobalDimensions();
+    std::vector<int> gd= gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -302,30 +304,7 @@ public:
     //     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
     //     param.approximation_type=COEFF_TYPE_TANH;
     param.tuning_strategy_xml=
-"<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name><TuningConstant>1.0</TuningConstant></TuningStrategy>\n";
-     UnprecOvExtFermActArray S_f(cfs,param);
-     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
-     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
-     return M;
-   }
-   if ( parms == HwPartFracTanh ) {
-     if ( Ls%2 == 0 ) { 
-       printf("Ls is not odd\n");
-       exit(-1);
-     }
-     UnprecOvExtFermActArrayParams param;
-     param.OverMass=M5; 
-     param.Mass=_mq;
-     param.RatPolyDeg = Ls;
-     param.ApproxMin =eps_lo;
-     param.ApproxMax =eps_hi;
-     param.b5 =1.0;
-     param.c5 =1.0;
-     //     param.approximation_type=COEFF_TYPE_ZOLOTAREV;
-     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
-     //param.approximation_type=COEFF_TYPE_TANH;
-     param.tuning_strategy_xml=
-       "<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name><TuningConstant>1.0</TuningConstant></TuningStrategy>\n";
+"<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name></TuningStrategy>\n";
     UnprecOvExtFermActArray S_f(cfs,param);
     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
@@ -337,35 +316,7 @@ public:
     param.ApproxMin=eps_lo;
     param.ApproxMax=eps_hi;
     param.approximation_type=COEFF_TYPE_ZOLOTAREV;
-     param.RatPolyDeg=Ls-1;
-     // The following is why I think Chroma made some directional errors:
-     param.AuxFermAct= std::string(
-"<AuxFermAct>\n"
-"  <FermAct>UNPRECONDITIONED_WILSON</FermAct>\n"
-"  <Mass>-1.8</Mass>\n"
-"  <b5>1</b5>\n"
-"  <c5>0</c5>\n"
-"  <MaxCG>1000</MaxCG>\n"
-"  <RsdCG>1.0e-9</RsdCG>\n"
-"  <FermionBC>\n"
-"      <FermBC>SIMPLE_FERMBC</FermBC>\n"
-"      <boundary>1 1 1 1</boundary>\n"
-"   </FermionBC> \n"
-"</AuxFermAct>"
-);
-     param.AuxFermActGrp= std::string("");
-     UnprecOvlapContFrac5DFermActArray S_f(fbc,param);
-     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
-     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
-     return  M;
-   }
-   if ( parms == HwContFracTanh ) {
-     UnprecOvlapContFrac5DFermActParams param;
-     param.Mass=_mq; // How is M5 set? Wilson mass In AuxFermAct
-     param.ApproxMin=eps_lo;
-     param.ApproxMax=eps_hi;
-     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
-     param.RatPolyDeg=Ls-1;
+     param.RatPolyDeg=Ls;
     // The following is why I think Chroma made some directional errors:
     param.AuxFermAct= std::string(
 "<AuxFermAct>\n"
@@ -427,14 +378,7 @@ int main (int argc,char **argv )
   * Setup QDP
   *********************************************************/
  Chroma::initialize(&argc,&argv);
-  //  Chroma::WilsonTypeFermActs4DEnv::registerAll(); 
-  Chroma::WilsonTypeFermActsEnv::registerAll(); 
-  //bool linkageHack(void)
-  //{
-  //  bool foo = true;
-  // Inline Measurements
-  //  InlineAggregateEnv::registerAll();
-  //  GaugeInitEnv::registerAll();
+  Chroma::WilsonTypeFermActs4DEnv::registerAll(); 

  /********************************************************
   * Setup Grid
@@ -444,34 +388,26 @@ int main (int argc,char **argv )
                                                                       Grid::GridDefaultSimd(Grid::Nd,Grid::vComplex::Nsimd()),
                                                                       Grid::GridDefaultMpi());
  
-  Grid::Coordinate gd = UGrid->GlobalDimensions();
+  std::vector<int> gd = UGrid->GlobalDimensions();
  QDP::multi1d<int> nrow(QDP::Nd);
  for(int mu=0;mu<4;mu++) nrow[mu] = gd[mu];

  QDP::Layout::setLattSize(nrow);
  QDP::Layout::create();

+  Grid::GridCartesian         * FGrid   = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  Grid::LatticeGaugeField lat(UGrid);
+  Grid::LatticeFermion    src(FGrid);
+  Grid::LatticeFermion    res_chroma(FGrid);
+  Grid::LatticeFermion    res_grid  (FGrid);
+  
  std::vector<ChromaAction> ActionList({
 		 HtCayleyTanh, // Plain old DWF.
 		 HmCayleyTanh,
 		 HwCayleyTanh,
 		 HtCayleyZolo, // Plain old DWF.
 		 HmCayleyZolo,
-		 HwCayleyZolo,
-		 HwPartFracZolo,
-		 HwContFracZolo,
-		 HwContFracTanh
-  });
-  std::vector<int> LsList({
-      8,//HtCayleyTanh, // Plain old DWF.
-      8,//HmCayleyTanh,
-      8,//HwCayleyTanh,
-      8,//HtCayleyZolo, // Plain old DWF.
-      8,//HmCayleyZolo,
-      8,//HwCayleyZolo,
-      9,//HwPartFracZolo
-      9, //HwContFracZolo
-      9 //HwContFracTanh
+		 HwCayleyZolo
  });
  std::vector<std::string> ActionName({
        "HtCayleyTanh",
@@ -479,19 +415,10 @@ int main (int argc,char **argv )
 	"HwCayleyTanh",
 	"HtCayleyZolo",
 	"HmCayleyZolo",
-        "HwCayleyZolo",
-	"HwPartFracZolo",
-	"HwContFracZolo",
-	"HwContFracTanh"
+        "HwCayleyZolo"
  });

  for(int i=0;i<ActionList.size();i++) {
-    Ls = LsList[i];
-    Grid::GridCartesian      * FGrid   = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-    Grid::LatticeGaugeField lat(UGrid);
-    Grid::LatticeFermion    src(FGrid);
-    Grid::LatticeFermion    res_chroma(FGrid);
-    Grid::LatticeFermion    res_grid  (FGrid);
    std::cout << "*****************************"<<std::endl;
    std::cout << "Action "<<ActionName[i]<<std::endl;
    std::cout << "*****************************"<<std::endl;
@@ -512,7 +439,6 @@ int main (int argc,char **argv )
      
      std::cout << "Norm of difference "<<Grid::norm2(res_chroma)<<std::endl;
    }
-    delete FGrid;
  }

  std::cout << "Finished test "<<std::endl;
@@ -576,7 +502,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF
  Grid::gaussian(RNG5,src);
  Grid::gaussian(RNG5,res);

-  Grid::SU<Grid::Nc>::HotConfiguration(RNG4,Umu);
+  Grid::SU<Nc>::HotConfiguration(RNG4,Umu);

  /*
  Grid::LatticeColourMatrix U(UGrid);
@@ -593,7 +519,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HtCayleyTanh ) { 

-    Grid::DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);
+    Grid::DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);

    std::cout << Grid::GridLogMessage <<" Calling domain wall multiply "<<std::endl;

@@ -609,7 +535,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

    Grid::Real _b = 0.5*(mobius_scale +1.0);
    Grid::Real _c = 0.5*(mobius_scale -1.0);
-    Grid::MobiusZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);
+    Grid::MobiusZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);

    std::cout << Grid::GridLogMessage <<" Calling mobius zolo multiply "<<std::endl;

@@ -623,7 +549,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HtCayleyZolo ) {

-    Grid::ShamirZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    Grid::ShamirZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);

    std::cout << Grid::GridLogMessage <<" Calling shamir zolo multiply "<<std::endl;

@@ -635,60 +561,6 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF
    return;
  }

-  if ( action == HwPartFracTanh ) {
-
-    Grid::OverlapWilsonPartialFractionTanhFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
-
-    std::cout << Grid::GridLogMessage <<" Calling part frac tanh multiply "<<std::endl;
-
-    if ( dag ) 
-      Dov.Mdag(src,res);  
-    else 
-      Dov.M(src,res);  
-
-    return;
-  }
-
-  if ( action == HwContFracTanh ) {
-
-    Grid::OverlapWilsonContFracTanhFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
-
-    std::cout << Grid::GridLogMessage <<" Calling cont frac tanh multiply "<<std::endl;
-
-    if ( dag ) 
-      Dov.Mdag(src,res);  
-    else 
-      Dov.M(src,res);  
-
-    return;
-  }
-  if ( action == HwContFracZolo ) {
-
-    Grid::OverlapWilsonContFracZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
-
-    std::cout << Grid::GridLogMessage <<" Calling cont frac zolo multiply "<<std::endl;
-
-    if ( dag ) 
-      Dov.Mdag(src,res);  
-    else 
-      Dov.M(src,res);  
-
-    return;
-  }
-
-  if ( action == HwPartFracZolo ) {
-
-    Grid::OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
-    std::cout << Grid::GridLogMessage <<" Calling part frac zolotarev multiply "<<std::endl;
-
-    if ( dag ) 
-      Dov.Mdag(src,res);  
-    else 
-      Dov.M(src,res);  
-
-    return;
-  }
-  
  /*
  if ( action == HmCayleyTanh ) {
    Grid::Real _b = 0.5*(mobius_scale +1.0);
@@ -709,7 +581,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HmCayleyTanh ) {

-    Grid::ScaledShamirFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);
+    Grid::ScaledShamirFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);

    std::cout << Grid::GridLogMessage <<" Calling scaled shamir multiply "<<std::endl;

@@ -723,7 +595,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HwCayleyTanh ) {

-    Grid::OverlapWilsonCayleyTanhFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
+    Grid::OverlapWilsonCayleyTanhFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);

    if ( dag ) 
      D.Mdag(src,res);  
@@ -735,7 +607,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HwCayleyZolo ) {

-    Grid::OverlapWilsonCayleyZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    Grid::OverlapWilsonCayleyZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);

    if ( dag ) 
      D.Mdag(src,res);  
--- a/tests/solver/Test_dwf_cg_prec.cc
+++ b/tests/solver/Test_dwf_cg_prec.cc
@@ -1,4 +1,4 @@
-*************************************************************************************
+/*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid

@@ -67,13 +67,7 @@ int main(int argc, char** argv) {
  result = Zero();
  LatticeGaugeField Umu(UGrid);

-#if 0
-  FieldMetaData header;
-  std::string file("ckpoint_lat.4000");
-  NerscIO::readConfiguration(Umu,header,file);
-#else  
  SU<Nc>::HotConfiguration(RNG4, Umu);
-#endif

  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt()
            << "   Ls: " << Ls << std::endl;
--- a/tests/solver/Test_dwf_cg_unprec.cc
+++ b/tests/solver/Test_dwf_cg_unprec.cc
@@ -54,30 +54,15 @@ int main (int argc, char ** argv)
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

-  std::vector<ComplexD> qmu;
-  qmu.push_back(ComplexD(0.1,0.0));
-  qmu.push_back(ComplexD(0.0,0.0));
-  qmu.push_back(ComplexD(0.0,0.0));
-  qmu.push_back(ComplexD(0.0,0.01));
-  
-
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

-  LatticeFermion    tmp(FGrid);
  LatticeFermion    src(FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=Zero();
-  LatticeGaugeField Umu(UGrid); 
-#if 0
-  FieldMetaData header;
-  std::string file("ckpoint_lat.4000");
-  NerscIO::readConfiguration(Umu,header,file);
-#else  
-  SU<Nc>::HotConfiguration(RNG4,Umu);
-#endif
-  
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
+
  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -86,15 +71,8 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5=1.8;
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  Ddwf.qmu = qmu;

-  Ddwf.M(src,tmp);
-  std::cout << " |M src|^2 "<<norm2(tmp)<<std::endl;
  MdagMLinearOperator<DomainWallFermionD,LatticeFermion> HermOp(Ddwf);
-  HermOp.HermOp(src,tmp);
-
-  std::cout << " <src|MdagM| src> "<<innerProduct(src,tmp)<<std::endl;
-  
  ConjugateGradient<LatticeFermion> CG(1.0e-6,10000);
  CG(HermOp,src,result);
Author	SHA1	Message	Date
Christopher Kelly	e28cf2b8be	Merge `1dfaa08afb` into `b8a7004365`	2023-09-03 09:29:21 -07:00
Christopher Kelly	1dfaa08afb	The stencils for the staple and rect-staple padded cell implementations are now created and stored by workspace classes that allow for reuse providing the grids remain consistent. The workspaces are now used by the plaq+rectangle gauge action resulting in a further 2x performance improvement as measured on a 16^4 local volume for 2 nodes (16 ranks) of Crusher.	2023-06-28 15:11:24 -04:00
Christopher Kelly	f44dce390f	Implemented accelerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding. Fixed const correctness on PaddedCell methods. Fixed compile issues on Crusher. Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance. Optimized kernel for StaplePadded. Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings.	2023-06-27 14:58:10 -04:00
Christopher Kelly	bb71e9a96a	Added PaddedCell and GeneralisedLocalStencil header includes to standard base headers. Moved versions of the padded-cell implementations of staple and rect-staple from test code to WilsonLoops header. Added StapleAndRectStapleAll which is now called by the plaq+rectangle action class. Under the hood it uses the padded cell implementations with maximal reuse of the padded gauge links.	2023-06-27 11:23:30 -04:00
Christopher Kelly	6f6844ccf1	Added new StapleAll and RectStapleAll functions that return the staples for all mu as an array. Modified plaq+rectangle gauge actions to use the above. Added a test code to confirm the above changes.	2023-06-26 15:48:47 -04:00
Christopher Kelly	4c6613d72c	Modified RectStapleDouble and RectStapleOptimised to use Gauge-BC respecting CshiftLink. Added test code tests/debug/Test_optimized_staple_gaugebc demonstrating equivalence of above to RectStapleUnoptimised for cconj gauge BCs. Removed optimized staple only being used for periodic gauge BCs; it is now always used.	2023-06-26 10:20:23 -04:00
Christopher Kelly	36cc9c524f	Threaded the constructor of GeneralLocalStencil	2023-06-23 09:57:38 -04:00
Christopher Kelly	4241c7d4a3	Imported coalescedReadGeneralPermute GPU implementation from Christoph. Fixed bug in padded staple code where extract was being called on the result before the GPU view was closed. Fixed compile issue with pointer cast in padded staple code. Added timing summaries of padded staple code and timing breakdown of staple implementation to Test_padded_cell_staple.	2023-06-21 16:01:01 -04:00
Christopher Kelly	7b11075102	The user can now specify the implementation of Cshift used by the PaddedCell class through a virtual base class API. Implementations for default (regular Cshift) and for gauge links (which respects the gauge BCs). Fixed const-correctness for PaddedCell and ConjugateGimpl::setDirections. Modified test code for padded-cell implementation of staple, rect-staple to use cconj BCs.	2023-06-20 17:09:56 -04:00
Christopher Kelly	abc658dca5	Added coalescedReadGeneralPermute CPU implementation based on Christoph's GPT code. In a test code, implemented a padded-cell version of the staple and rectangular-staple calculation.	2023-06-20 16:14:25 -04:00