Profiling temporary code until optimised

Optional checkpoint smeared configs for FTHMC
Additional tests
2025-06-22 09:42:02 +01:00 · 2023-06-15 06:54:10 -04:00 · 2023-06-14 04:54:29 -04:00 · 2023-06-13 11:57:11 -04:00 · 2023-06-13 11:56:37 -04:00 · 2023-06-13 11:56:11 -04:00
34 changed files with 197 additions and 2479 deletions
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -419,15 +419,14 @@ until convergence
 	}
      }
-      if ( Nconv < Nstop ) {
+      if ( Nconv < Nstop )
 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
-	std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
+
      }
      eval=eval2;
      //Keep only converged
-      eval.resize(Nstop);// was Nconv
+      eval.resize(Nconv);// Nstop?
-      evec.resize(Nstop,grid);// was Nconv
+      evec.resize(Nconv,grid);// Nstop?
      basisSortInPlace(evec,eval,reverse);
    }
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -604,8 +604,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@ -47,4 +47,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -697,68 +697,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
  // the above should guarantee that the operations are local
 #if 1
  size_t nsite = 1;
  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
  size_t tbytes = 4*nsite*sizeof(int);
  int *table = (int*)malloc(tbytes);
  thread_for(idx, nsite, {
      Coordinate from_coor, to_coor;
      size_t rem = idx;
      for(int i=0;i<nd;i++){
 	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
 	from_coor[i] = base_i + FromLowerLeft[i];
 	to_coor[i] = base_i + ToLowerLeft[i];
      }
      int foidx = Fg->oIndex(from_coor);
      int fiidx = Fg->iIndex(from_coor);
      int toidx = Tg->oIndex(to_coor);
      int tiidx = Tg->iIndex(to_coor);
      int* tt = table + 4*idx;
      tt[0] = foidx;
      tt[1] = fiidx;
      tt[2] = toidx;
      tt[3] = tiidx;
    });
  int* table_d = (int*)acceleratorAllocDevice(tbytes);
  acceleratorCopyToDevice(table,table_d,tbytes);
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
  autoView(from_v,From,AcceleratorRead);
  autoView(to_v,To,AcceleratorWrite);
  accelerator_for(idx,nsite,1,{
      static const int words=sizeof(vobj)/sizeof(vector_type);
      int* tt = table_d + 4*idx;
      int from_oidx = *tt++;
      int from_lane = *tt++;
      int to_oidx = *tt++;
      int to_lane = *tt;
      const vector_type* from = (const vector_type *)&from_v[from_oidx];
      vector_type* to = (vector_type *)&to_v[to_oidx];
      scalar_type stmp;
      for(int w=0;w<words;w++){
 	stmp = getlane(from[w], from_lane);
 	putlane(to[w], stmp, to_lane);
      }
    });
  acceleratorFreeDevice(table_d);    
  free(table);
 #else  
  Coordinate ldf = Fg->_ldimensions;
  Coordinate rdf = Fg->_rdimensions;
  Coordinate isf = Fg->_istride;
@ -798,8 +738,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 #endif
    }
  });
 #endif
 }
@ -892,8 +830,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 }
 //Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
 //The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@ -915,65 +851,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
    }
  }
 #if 1
  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
  size_t tbytes = 4*nsite*sizeof(int);
  int *table = (int*)malloc(tbytes);
  thread_for(idx,nsite,{
    Coordinate lcoor(nl);
    Coordinate hcoor(nh);
    lcoor[orthog] = slice_lo;
    hcoor[orthog] = slice_hi;
    size_t rem = idx;
    for(int mu=0;mu<nl;mu++){
      if(mu != orthog){
 	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
 	lcoor[mu] = hcoor[mu] = xmu;
      }
    }
    int loidx = lg->oIndex(lcoor);
    int liidx = lg->iIndex(lcoor);
    int hoidx = hg->oIndex(hcoor);
    int hiidx = hg->iIndex(hcoor);
    int* tt = table + 4*idx;
    tt[0] = loidx;
    tt[1] = liidx;
    tt[2] = hoidx;
    tt[3] = hiidx;
    });
  int* table_d = (int*)acceleratorAllocDevice(tbytes);
  acceleratorCopyToDevice(table,table_d,tbytes);
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
  autoView(lowDim_v,lowDim,AcceleratorRead);
  autoView(higherDim_v,higherDim,AcceleratorWrite);
  accelerator_for(idx,nsite,1,{
      static const int words=sizeof(vobj)/sizeof(vector_type);
      int* tt = table_d + 4*idx;
      int from_oidx = *tt++;
      int from_lane = *tt++;
      int to_oidx = *tt++;
      int to_lane = *tt;
      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
      scalar_type stmp;
      for(int w=0;w<words;w++){
 	stmp = getlane(from[w], from_lane);
 	putlane(to[w], stmp, to_lane);
      }
    });
  acceleratorFreeDevice(table_d);    
  free(table);
 #else
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
@ -989,7 +866,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
 #endif
 }
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@ -26,32 +26,14 @@ Author: Peter Boyle pboyle@bnl.gov
 /*  END LEGAL */
 #pragma once
 #include<Grid/cshift/Cshift.h>
 NAMESPACE_BEGIN(Grid);
 //Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
 template<typename vobj>
 struct CshiftImplBase{
  virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
  virtual ~CshiftImplBase(){}
 };
 template<typename vobj>
 struct CshiftImplDefault: public CshiftImplBase<vobj>{
  Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
 };
 template<typename Gimpl>
 struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
 };  
 class PaddedCell {
 public:
  GridCartesian * unpadded_grid;
  int dims;
  int depth;
  std::vector<GridCartesian *> grids;
  ~PaddedCell()
  {
    DeleteGrids();
@ -95,7 +77,7 @@ public:
    }
  };
  template<class vobj>
-  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
+  inline Lattice<vobj> Extract(Lattice<vobj> &in)
  {
    Lattice<vobj> out(unpadded_grid);
@ -106,19 +88,19 @@ public:
    return out;
  }
  template<class vobj>
-  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
+  inline Lattice<vobj> Exchange(Lattice<vobj> &in)
  {
    GridBase *old_grid = in.Grid();
    int dims = old_grid->Nd();
    Lattice<vobj> tmp = in;
    for(int d=0;d<dims;d++){
-      tmp = Expand(d,tmp,cshift); // rvalue && assignment
+      tmp = Expand(d,tmp); // rvalue && assignment
    }
    return tmp;
  }
  // expand up one dim at a time
  template<class vobj>
-  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
+  inline Lattice<vobj> Expand(int dim,Lattice<vobj> &in)
  {
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
@ -130,40 +112,20 @@ public:
    else       conformable(old_grid,grids[dim-1]);
    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
    double tins=0, tshift=0;
    // Middle bit
    double t = usecond();
    for(int x=0;x<local[dim];x++){
      InsertSliceLocal(in,padded,x,depth+x,dim);
    }
    tins += usecond() - t;
    // High bit
-    t = usecond();
+    shifted = Cshift(in,dim,depth);
    shifted = cshift.Cshift(in,dim,depth);
    tshift += usecond() - t;
    t=usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
    tins += usecond() - t;
    // Low bit
-    t = usecond();
+    shifted = Cshift(in,dim,-depth);
    shifted = cshift.Cshift(in,dim,-depth);
    tshift += usecond() - t;
    t = usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
    tins += usecond() - t;
    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
    return padded;
  }
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -423,6 +423,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
 #define KERNEL_CALL_EXT(A)						\
  const uint64_t    NN = Nsite*Ls;					\
  const uint64_t    sz = st.surface_list.size();			\
  auto ptr = &st.surface_list[0];					\
  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@ -176,7 +176,7 @@ public:
      return PeriodicBC::CshiftLink(Link,mu,shift);
  }
-  static inline void       setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
+  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
 };
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@ -43,7 +43,7 @@ public:
 private:
  RealD c_plaq;
  RealD c_rect;
-  typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
+
 public:
  PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};
@ -79,18 +79,27 @@ public:
    GridBase *grid = Umu.Grid();
    std::vector<GaugeLinkField> U (Nd,grid);
    std::vector<GaugeLinkField> U2(Nd,grid);
    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
      WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
    }
    std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
    WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
    GaugeLinkField dSdU_mu(grid);
    GaugeLinkField staple(grid);
    for (int mu=0; mu < Nd; mu++){
-      dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
+
-      dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
+      // Staple in direction mu
      WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
      dSdU_mu = Ta(U[mu]*staple)*factor_p;
      WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
      dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@ -37,14 +37,13 @@ NAMESPACE_BEGIN(Grid);
 // Make these members of an Impl class for BC's.
 namespace PeriodicBC { 
-  //Out(x) = Link(x)*field(x+mu)
+
  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
 									   int mu,
 									   const Lattice<covariant> &field)
  {
    return Link*Cshift(field,mu,1);// moves towards negative mu
  }
  //Out(x) = Link^dag(x-mu)*field(x-mu)
  template<class covariant,class gauge> Lattice<covariant> CovShiftBackward(const Lattice<gauge> &Link, 
 									    int mu,
 									    const Lattice<covariant> &field)
@ -53,19 +52,19 @@ namespace PeriodicBC {
    tmp = adj(Link)*field;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
-  //Out(x) = Link^dag(x-mu)
+
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) 
  {
    return Cshift(adj(Link), mu, -1);
  }
-  //Out(x) = Link(x)
+
  template<class gauge> Lattice<gauge>
  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
  {
    return Link;
  }
-  //Link(x) = Link(x+mu)
+
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@ -40,20 +40,18 @@ Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iSc
  GridBase *grid=Umu.Grid();
  auto lvol = grid->lSites();
  Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
-  typedef typename Vec::scalar_type scalar;
+
  autoView(Umu_v,Umu,CpuRead);
  autoView(ret_v,ret,CpuWrite);
  thread_for(site,lvol,{
    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
    Coordinate lcoor;
    grid->LocalIndexToLocalCoor(site, lcoor);
-    iScalar<iScalar<iMatrix<scalar, N> > > Us;
+    iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
    peekLocalSite(Us, Umu_v, lcoor);
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
-	scalar tmp= Us()()(i,j);
+	EigenU(i,j) = Us()()(i,j);
 	ComplexD ztmp(real(tmp),imag(tmp));
 	EigenU(i,j)=ztmp;
      }}
    ComplexD detD  = EigenU.determinant();
    typename Vec::scalar_type det(detD.real(),detD.imag());
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -290,7 +290,7 @@ public:
  }
 */
  //////////////////////////////////////////////////
-  // the sum over all nu-oriented staples for nu != mu on each site
+  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
@ -300,10 +300,6 @@ public:
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
    Staple(staple, U, mu);
  }
  static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
    staple = Zero();
    for (int nu = 0; nu < Nd; nu++) {
@ -339,202 +335,6 @@ public:
    }
  }
  /////////////
  //Staples for each direction mu, summed over nu != mu
  //staple: output staples for each mu (Nd)
  //U: link array (Nd)
  /////////////
  static void StapleAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U) {
    assert(staple.size() == Nd); assert(U.size() == Nd);
    for(int mu=0;mu<Nd;mu++) Staple(staple[mu], U, mu);
  }
  //A workspace class allowing reuse of the stencil
  class WilsonLoopPaddedStencilWorkspace{
    std::unique_ptr<GeneralLocalStencil> stencil;
    size_t nshift;
    void generateStencil(GridBase* padded_grid){
      double t0 = usecond();
      //Generate shift arrays
      std::vector<Coordinate> shifts = this->getShifts();
      nshift = shifts.size();
      double t1 = usecond();
      //Generate local stencil
      stencil.reset(new GeneralLocalStencil(padded_grid,shifts));
      double t2 = usecond();
      std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl;   
    }
  public:
    //Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly
    const GeneralLocalStencil & getStencil(const PaddedCell &pcell){
      assert(pcell.depth >= this->paddingDepth());
      if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back());
      return *stencil;
    }
    size_t Nshift() const{ return nshift; }
    virtual std::vector<Coordinate> getShifts() const = 0;
    virtual int paddingDepth() const = 0; //padding depth required
    virtual ~WilsonLoopPaddedStencilWorkspace(){}
  };
  //This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces
  class WilsonLoopPaddedWorkspace{
    std::vector<WilsonLoopPaddedStencilWorkspace*> stencil_wk;
    std::unique_ptr<PaddedCell> pcell;
    void generatePcell(GridBase* unpadded_grid){
      assert(stencil_wk.size());
      int max_depth = 0;
      for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth());
      pcell.reset(new PaddedCell(max_depth, dynamic_cast<GridCartesian*>(unpadded_grid)));
    }
  public:
    //Add a stencil definition. This should be done before the first call to retrieve a stencil object.
    //Takes ownership of the pointer
    void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){
      assert(!pcell);
      stencil_wk.push_back(stencil);
    }
    const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){
      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
      return stencil_wk[stencil_idx]->getStencil(*pcell);
    }      
    const PaddedCell & getPaddedCell(GridBase* unpadded_grid){
      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
      return *pcell;
    }
    ~WilsonLoopPaddedWorkspace(){
      for(auto &s : stencil_wk) delete s;
    }
  };
  //A workspace class allowing reuse of the stencil
  class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
  public:
    std::vector<Coordinate> getShifts() const override{
      std::vector<Coordinate> shifts;
      for(int mu=0;mu<Nd;mu++){
 	for(int nu=0;nu<Nd;nu++){
 	  if(nu != mu){
 	    Coordinate shift_0(Nd,0);
 	    Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
 	    Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
 	    Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
 	    Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
 	    //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
 	    shifts.push_back(shift_0);
 	    shifts.push_back(shift_nu);
 	    shifts.push_back(shift_mu);
 	    //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
 	    shifts.push_back(shift_mnu);
 	    shifts.push_back(shift_mnu);
 	    shifts.push_back(shift_mnu_pmu);
 	  }
 	}
      }
      return shifts;
    }
    int paddingDepth() const override{ return 1; }
  }; 
  //Padded cell implementation of the staple method for all mu, summed over nu != mu
  //staple: output staple for each mu, summed over nu != mu (Nd)
  //U_padded: the gauge link fields padded out using the PaddedCell class
  //Cell: the padded cell class
  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
    StaplePaddedAllWorkspace wk;
    StaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
  }
  //Padded cell implementation of the staple method for all mu, summed over nu != mu
  //staple: output staple for each mu, summed over nu != mu (Nd)
  //U_padded: the gauge link fields padded out using the PaddedCell class
  //Cell: the padded cell class
  //gStencil: the precomputed generalized local stencil for the staple
  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
    double t0 = usecond();
    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
    assert(Cell.depth >= 1);
    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
    int shift_mu_off = gStencil._npoints/Nd;
    //Open views to padded gauge links and keep open over mu loop
    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
    size_t vsize = Nd*sizeof(GaugeViewType);
    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
    GaugeMat gStaple(ggrid);
    int outer_off = 0;
    for(int mu=0;mu<Nd;mu++){
      { //view scope
 	autoView( gStaple_v , gStaple, AcceleratorWrite);
 	auto gStencil_v = gStencil.View();
 	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
 	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
 	    stencil_ss = Zero();
 	    int off = outer_off;
 	    for(int nu=0;nu<Nd;nu++){
 	      if(nu != mu){	  
 		GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
 		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(off++,ss);
 		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(off++,ss);
 		auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		stencil_ss = stencil_ss + U2 * U1 * U0;
 		e = gStencil_v.GetEntry(off++,ss);
 		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(off++,ss);
 		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(off++,ss);
 		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		stencil_ss = stencil_ss + U2 * U1 * U0;
 	      }
 	    }
 	    coalescedWrite(gStaple_v[ss],stencil_ss);
 	  }
 	  );
      } //ensure views are all closed!
      staple[mu] = Cell.Extract(gStaple);
      outer_off += shift_mu_off;
    }//mu loop
    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
    free(Ug_dirs_v_host);
    acceleratorFreeDevice(Ug_dirs_v);
    double t1=usecond();
    std::cout << GridLogPerformance << "StaplePaddedAll timing:" << (t1-t0)/1000 << "ms" << std::endl;   
  }
  //////////////////////////////////////////////////
  // the sum over all staples on each site in direction mu,nu, upper part
  //////////////////////////////////////////////////
@ -907,14 +707,18 @@ public:
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
-    U2 = U * Gimpl::CshiftLink(U, mu, 1);
+    U2 = U * Cshift(U, mu, 1);
  }
  ////////////////////////////////////////////////////////////////////////////
-  // Hop by two optimisation strategy. Use RectStapleDouble to obtain 'U2'
+  // Hop by two optimisation strategy does not work nicely with Gparity. (could
  // do,
  // but need to track two deep where cross boundary and apply a conjugation).
  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do
  // so .
  ////////////////////////////////////////////////////////////////////////////
-  static void RectStapleOptimised(GaugeMat &Stap, const std::vector<GaugeMat> &U2,
+  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
-                                  const std::vector<GaugeMat> &U, int mu) {
+                                  std::vector<GaugeMat> &U, int mu) {
    Stap = Zero();
@ -928,9 +732,9 @@ public:
        // Up staple    ___ ___
        //             |       |
-        tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
+        tmp = Cshift(adj(U[nu]), nu, -1);
        tmp = adj(U2[mu]) * tmp;
-        tmp = Gimpl::CshiftLink(tmp, mu, -2);
+        tmp = Cshift(tmp, mu, -2);
        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);
@ -938,14 +742,14 @@ public:
        //             |___ ___|
        //
        tmp = adj(U2[mu]) * U[nu];
-        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));
+        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));
        //              ___ ___
        //             |    ___|
        //             |___ ___|
        //
-        Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
+        Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
        //              ___ ___
        //             |___    |
@ -954,7 +758,7 @@ public:
        //  tmp= Staple2x1* Cshift(U[mu],mu,-2);
        //  Stap+= Cshift(tmp,mu,1) ;
-        Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
+        Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);
        ;
        //       --
@ -962,10 +766,10 @@ public:
        //
        //      |  |
-        tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
+        tmp = Cshift(adj(U2[nu]), nu, -2);
        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
-        tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
+        tmp = U2[nu] * Cshift(tmp, nu, 2);
-        Stap += Gimpl::CshiftLink(tmp, mu, 1);
+        Stap += Cshift(tmp, mu, 1);
        //      |  |
        //
@ -974,12 +778,25 @@ public:
        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
        tmp = adj(U2[nu]) * tmp;
-        tmp = Gimpl::CshiftLink(tmp, nu, -2);
+        tmp = Cshift(tmp, nu, -2);
-        Stap += Gimpl::CshiftLink(tmp, mu, 1);
+        Stap += Cshift(tmp, mu, 1);
      }
    }
  }
  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
    RectStapleUnoptimised(Stap, Umu, mu);
  }
  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
                         int mu) {
    if (Gimpl::isPeriodicGaugeField()) {
      RectStapleOptimised(Stap, U2, U, mu);
    } else {
      RectStapleUnoptimised(Stap, Umu, mu);
    }
  }
  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
                                    int mu) {
    GridBase *grid = Umu.Grid();
@ -1078,288 +895,6 @@ public:
    }
  }
  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
    RectStapleUnoptimised(Stap, Umu, mu);
  }
  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
                         int mu) {
    RectStapleOptimised(Stap, U2, U, mu);
  }
  //////////////////////////////////////////////////////
  //Compute the rectangular staples for all orientations
  //Stap : Array of staples (Nd)
  //U: Gauge links in each direction (Nd)
  /////////////////////////////////////////////////////
  static void RectStapleAll(std::vector<GaugeMat> &Stap, const std::vector<GaugeMat> &U){
    assert(Stap.size() == Nd); assert(U.size() == Nd);
    std::vector<GaugeMat> U2(Nd,U[0].Grid());
    for(int mu=0;mu<Nd;mu++) RectStapleDouble(U2[mu], U[mu], mu);
    for(int mu=0;mu<Nd;mu++) RectStapleOptimised(Stap[mu], U2, U, mu);
  }
  //A workspace class allowing reuse of the stencil
  class RectStaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
  public:
    std::vector<Coordinate> getShifts() const override{
      std::vector<Coordinate> shifts;
      for (int mu = 0; mu < Nd; mu++){
 	for (int nu = 0; nu < Nd; nu++) {
 	  if (nu != mu) {
 	    auto genShift = [&](int mushift,int nushift){
 	      Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
 	    };
 	    //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
 	    shifts.push_back(genShift(0,0));
 	    shifts.push_back(genShift(0,+1));
 	    shifts.push_back(genShift(+1,+1));
 	    shifts.push_back(genShift(+2,0));
 	    shifts.push_back(genShift(+1,0));
 	    //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
 	    shifts.push_back(genShift(0,-1));
 	    shifts.push_back(genShift(0,-1));
 	    shifts.push_back(genShift(+1,-1));
 	    shifts.push_back(genShift(+2,-1));
 	    shifts.push_back(genShift(+1,0));
 	    //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
 	    shifts.push_back(genShift(-1,0));
 	    shifts.push_back(genShift(-1,-1));
 	    shifts.push_back(genShift(-1,-1));
 	    shifts.push_back(genShift(0,-1));
 	    shifts.push_back(genShift(+1,-1));
 	    //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
 	    shifts.push_back(genShift(-1,0));
 	    shifts.push_back(genShift(-1,0));
 	    shifts.push_back(genShift(-1,+1));
 	    shifts.push_back(genShift(0,+1));
 	    shifts.push_back(genShift(+1,0));
 	    //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
 	    shifts.push_back(genShift(0,0));
 	    shifts.push_back(genShift(0,+1));
 	    shifts.push_back(genShift(0,+2));
 	    shifts.push_back(genShift(+1,+1));
 	    shifts.push_back(genShift(+1,0));
 	    //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
 	    shifts.push_back(genShift(0,-1));
 	    shifts.push_back(genShift(0,-2));
 	    shifts.push_back(genShift(0,-2));
 	    shifts.push_back(genShift(+1,-2));
 	    shifts.push_back(genShift(+1,-1));
 	  }
 	}
      }
      return shifts;
    }
    int paddingDepth() const override{ return 2; }
  }; 
  //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
  //staple: output staple for each mu, summed over nu != mu (Nd)
  //U_padded: the gauge link fields padded out using the PaddedCell class
  //Cell: the padded cell class
  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
    RectStaplePaddedAllWorkspace wk;
    RectStaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
  }
  //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
  //staple: output staple for each mu, summed over nu != mu (Nd)
  //U_padded: the gauge link fields padded out using the PaddedCell class
  //Cell: the padded cell class
  //gStencil: the stencil
  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
    double t0 = usecond();
    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
    assert(Cell.depth >= 2);
    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
    size_t nshift = gStencil._npoints;
    int mu_off_delta = nshift / Nd;
    //Open views to padded gauge links and keep open over mu loop
    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
    size_t vsize = Nd*sizeof(GaugeViewType);
    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
    GaugeMat gStaple(ggrid); //temp staple object on padded grid
    int offset = 0;
    for(int mu=0; mu<Nd; mu++){
      { //view scope
 	autoView( gStaple_v , gStaple, AcceleratorWrite);
 	auto gStencil_v = gStencil.View();
 	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
 	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
 	    stencil_ss = Zero();
 	    int s=offset;
 	    for(int nu=0;nu<Nd;nu++){
 	      if(nu != mu){
 		//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
 		GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
 		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(s++,ss);
 		auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
 		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
 		//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
 		e = gStencil_v.GetEntry(s++,ss);
 		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(s++,ss);
 		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
 		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
 		//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
 		e = gStencil_v.GetEntry(s++,ss);
 		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(s++,ss);
 		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(s++,ss);
 		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
 		//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
 		e = gStencil_v.GetEntry(s++,ss);
 		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(s++,ss);
 		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
 		//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
 		e = gStencil_v.GetEntry(s++,ss);
 		U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(s++,ss);
 		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
 		//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
 		e = gStencil_v.GetEntry(s++,ss);
 		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(s++,ss);
 		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		e = gStencil_v.GetEntry(s++,ss);
 		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(s++,ss);
 		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
 	      }
 	    }
 	    coalescedWrite(gStaple_v[ss],stencil_ss);
 	  }
 	  );
 	offset += mu_off_delta;
      }//kernel/view scope
      staple[mu] = Cell.Extract(gStaple);    
    }//mu loop
    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
    free(Ug_dirs_v_host);
    acceleratorFreeDevice(Ug_dirs_v);
    double t1 = usecond();
    std::cout << GridLogPerformance << "RectStaplePaddedAll timings:" << (t1-t0)/1000 << "ms" << std::endl;   
  }
  //A workspace for reusing the PaddedCell and GeneralLocalStencil objects
  class StapleAndRectStapleAllWorkspace: public WilsonLoopPaddedWorkspace{
  public:
    StapleAndRectStapleAllWorkspace(){
      this->addStencil(new StaplePaddedAllWorkspace);
      this->addStencil(new RectStaplePaddedAllWorkspace);
    }
  };     
  //////////////////////////////////////////////////////
  //Compute the 1x1 and 1x2 staples for all orientations
  //Stap : Array of staples (Nd)
  //RectStap: Array of rectangular staples (Nd)
  //U: Gauge links in each direction (Nd)
  /////////////////////////////////////////////////////
  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U){
    StapleAndRectStapleAllWorkspace wk;
    StapleAndRectStapleAll(Stap,RectStap,U,wk);
  }
  //////////////////////////////////////////////////////
  //Compute the 1x1 and 1x2 staples for all orientations
  //Stap : Array of staples (Nd)
  //RectStap: Array of rectangular staples (Nd)
  //U: Gauge links in each direction (Nd)
  //wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse
  /////////////////////////////////////////////////////
  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U, StapleAndRectStapleAllWorkspace &wk){
 #if 0
    StapleAll(Stap, U);
    RectStapleAll(RectStap, U);
 #else
    double t0 = usecond();
    GridCartesian* unpadded_grid = dynamic_cast<GridCartesian*>(U[0].Grid());
    const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid);
    CshiftImplGauge<Gimpl> cshift_impl;
    std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
    for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
    double t1 = usecond();
    StaplePaddedAll(Stap, U_pad, Ghost, wk.getStencil(0,unpadded_grid) );
    double t2 = usecond();
    RectStaplePaddedAll(RectStap, U_pad, Ghost, wk.getStencil(1,unpadded_grid));
    double t3 = usecond();
    std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
 #endif
  }
  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@ -79,10 +79,10 @@ public:
    this->_entries.resize(npoints* osites);
    this->_entries_p = &_entries[0];
-    thread_for(site, osites, {
+
    Coordinate Coor;
    Coordinate NbrCoor;
-
+    for(Integer site=0;site<osites;site++){
      for(Integer ii=0;ii<npoints;ii++){
 	Integer lex = site*npoints+ii;
 	GeneralStencilEntry SE;
@ -132,7 +132,7 @@ public:
 	////////////////////////////////////////////////
 	this->_entries[lex] = SE;
      }
-      });
+    }      
  }
 };
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -32,7 +32,6 @@
 #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate
 #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
 #include <Grid/stencil/GeneralLocalStencil.h>
 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
@ -706,7 +705,7 @@ public:
 	}
      }
    }
-    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@ -73,16 +73,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
    return vec;
  }
 }
 //'perm_mask' acts as a bitmask
 template<class vobj> accelerator_inline
 vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
 {
  auto obj = vec, tmp = vec;
  for (int d=0;d<nd;d++)
    if (perm_mask & (0x1 << d)) { permute(obj,tmp,d); tmp=obj;}
  return obj;
 }
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 {
@ -93,7 +83,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 {
  vstream(vec, extracted);
 }
-#else //==GRID_SIMT
+#else
 //#ifndef GRID_SYCL
@ -176,14 +166,6 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,
  return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
 typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  int plane = lane;
  for (int d=0;d<nd;d++)
    plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane;
  return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  insertLane(lane,vec,extracted);
--- a/Grid/tensors/Tensor_Ta.h
+++ b/Grid/tensors/Tensor_Ta.h
@ -90,12 +90,10 @@ template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnGroup(c
 template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
 accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
 {
  typedef typename iMatrix<vtype,N>::scalar_type scalar;
  // need a check for the group type?
  iMatrix<vtype,N> ret(arg);
  vtype nrm;
  vtype inner;
  scalar one(1.0);
  for(int c1=0;c1<N;c1++){
    // Normalises row c1
@ -104,7 +102,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
    nrm = sqrt(inner);
-    nrm = one/nrm;
+    nrm = 1.0/nrm;
    for(int c2=0;c2<N;c2++)
      ret._internal[c1][c2]*= nrm;
@ -129,7 +127,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
    nrm = sqrt(inner);
-    nrm = one/nrm;
+    nrm = 1.0/nrm;
    for(int c2=0;c2<N;c2++)
      ret._internal[c1][c2]*= nrm;
  }
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
 // Specialisation: Cayley-Hamilton exponential for SU(3)
-#if 0
+#ifndef GRID_ACCELERATED
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
 accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
 {
--- a/HMC/FTHMC2p1f.cc
+++ b/HMC/FTHMC2p1f.cc
@ -1,224 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Copyright (C) 2023
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
 #include <Grid/qcd/smearing/JacobianAction.h>
 using namespace Grid;
 int main(int argc, char **argv)
 {
  std::cout << std::setprecision(12);
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 12;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  HMCparams.StartingType     =std::string("HotStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 1;
  CPparams.saveSmeared   = true;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.04;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; // Scale factor two
  RealD c   = 0.0;
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 1.0e-2;
  OFRp.hi       = 64;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 14;
  OFRp.precision= 40;
  std::vector<Real> hasenbusch({ 0.1 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeField Uhot(GridPtr);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  bool ApplySmearing = true;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ActionLevel<HMCWrapper::Field> Level3(4);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 CG,
 	 CG, CG,
 	 CG, CG, 
 	 OFRp, false);
  EOFA.is_smeared = ApplySmearing;
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Quotients[h]->is_smeared = ApplySmearing;
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // lnDetJacobianAction
  /////////////////////////////////////////////////////////////
  double rho = 0.1;  // smearing parameter
  int Nsmear = 1;    // number of smearing levels - must be multiple of 2Nd
  int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
  if( ApplySmearing ) Level2.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  //  GaugeAction.is_smeared = ApplySmearing;
  GaugeAction.is_smeared = true;
  Level3.push_back(&GaugeAction);
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  TheHMC.Run(SmearingPolicy); // for smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@ -146,8 +146,6 @@ NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  using namespace Grid;
  std::cout << " Grid Initialise "<<std::endl;
  Grid_init(&argc, &argv);
  CartesianCommunicator::BarrierWorld();
@ -172,24 +170,24 @@ int main(int argc, char **argv) {
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
-  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
-  //  MD.name    = std::string("Force Gradient");
+  MD.name    = std::string("Force Gradient");
-  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
-  MD.name    = std::string("MinimumNorm2");
+  // MD.name    = std::string("MinimumNorm2");
  // TrajL = 2
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
-  MD.MDsteps =  14;
+  MD.MDsteps =  12;
  MD.trajL   = 0.5;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
-  HMCparams.Trajectories     = 20;
+  HMCparams.Trajectories     = 1;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("ColdStart");
-  //  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
@ -225,7 +223,7 @@ int main(int argc, char **argv) {
  Real pv_mass      = 1.0;
  //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
@ -277,10 +275,10 @@ int main(int argc, char **argv) {
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-9;
+  double StoppingCondition = 1e-8;
-  double MDStoppingCondition = 1e-8;
+  double MDStoppingCondition = 1e-7;
-  double MDStoppingConditionLoose = 1e-8;
+  double MDStoppingConditionLoose = 1e-7;
-  double MDStoppingConditionStrange = 1e-8;
+  double MDStoppingConditionStrange = 1e-7;
  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
--- a/documentation/GridXcode/readme.md
+++ b/documentation/GridXcode/readme.md
@ -10,8 +10,9 @@ For first time setup of the Xcode and Grid build environment on Mac OS, you will
 1. Install Xcode and the Xcode command-line utilities
 2. Set Grid environment variables
-3. Install and build Grid pre-requisites
+3. Install and build Open MPI ***optional***
-4. Install, Configure and Build Grid
+4. Install and build Grid pre-requisites
 5. Install, Configure and Build Grid
 Apple's [Xcode website][Xcode] is the go-to reference for 1, and the definitive reference for 4 and 5 is the [Grid Documentation][GridDoc].
@ -91,33 +92,60 @@ launchctl setenv GridPkg /opt/local</string>
 </plist>
 ```
-## 3. Install and build Grid pre-requisites
+## 3. Install and build Open MPI -- ***optional***
 Download the latest version of [Open MPI][OMPI] version 3.1 (I used 3.1.5) and build it like so:
 [OMPI]: https://www.open-mpi.org/software/ompi/v3.1/
    ../configure CC=clang CXX=clang++ CXXFLAGS=-g --prefix=$GridPre/bin
    make -j 4 all install
 ***Note the `/bin` at the end of the prefix - this is required. As a quirk of the OpenMPI installer, `--prefix` must point to the `bin` subdirectory, with other files installed in `$GridPre/include`, `$GridPre/lib`, `$GridPre/share`, etc.***
 Grid does not have any dependencies on fortran, however many standard scientific packages do, so you may wish to download GNU fortran (e.g. MacPorts ``gfortran`` package) and add the following to your configure invocation:
    F77=gfortran FC=gfortran
 ## 4. Install and build Grid pre-requisites
 To simplify the installation of **Grid pre-requisites**, you can use your favourite package manager, e.g.:
-### 3.1. [MacPorts][MacPorts]
+### 1. [MacPorts][MacPorts]
 [MacPorts]: https://www.macports.org "MacPorts package manager"
 Install [MacPorts][MacPorts] if you haven't done so already, and then install packages with:
-    sudo port install openmpi git-flow-avh gmp hdf5 mpfr fftw-3-single lapack wget autoconf automake bison cmake gawk libomp
+    sudo port install <portname>
-On a Mac without GPUs:
+These are the `portname`s for mandatory Grid libraries:
-    sudo port install OpenBLAS +native
+* git-flow-avh
 * gmp
 * hdf5
 * mpfr
-To use `Gnu sha256sum`:
+and these are the `portname`s for optional Grid libraries:
-    pushd /opt/local/bin; sudo ln -s gsha256sum sha256sum; popd 
+* fftw-3-single
 * lapack
 * doxygen
 * OpenBLAS
-These `port`s are not strictly necessary, but they are helpful:
+***Please update this list with any packages I've missed! ... and double-check whether OpenBLAS is really for Grid. NB: lapack doesn't seem to work. Should it be scalapack?***
-    sudo port install gnuplot gsl h5utils nasm rclone texinfo tree xorg-server
+### 2. [Homebrew][Homebrew]
-***Please update this list with any packages I've missed!***
+[Homebrew]: https://brew.sh "Homebrew package manager"
-#### Install LIME
+Install [Homebrew][Homebrew] if you haven't done so already, and then install packages with:
    sudo brew install <packagename>
 The same packages are available as from MacPorts.
 ### Install LIME ***optional***
 There isn't currently a port for [C-LIME][C-LIME], so download the source and then build it:
@ -126,19 +154,9 @@ There isn't currently a port for [C-LIME][C-LIME], so download the source and th
    ../configure CC=clang --prefix=$GridPre
    make -j 4 all install
-### 3.2. [Homebrew][Homebrew]
+## 5. Install, Configure and Build Grid
-[Homebrew]: https://brew.sh "Homebrew package manager"
+### 5.1 Install Grid
 Install [Homebrew][Homebrew] if you haven't done so already, and then install packages with:
    sudo brew install <packagename>
 I don't use Homebrew, so I'm not sure what the Brew package name equivalents are. ** Please update if you know **
 ## 4. Install, Configure and Build Grid
 ### 4.1 Install Grid
 [Grid]: https://github.com/paboyle/Grid
@ -156,7 +174,7 @@ or
 depending on how many times you like to enter your password.
-### 4.2 Configure Grid
+### 5.2 Configure Grid
 The Xcode build system supports multiple configurations for each project, by default: `Debug` and `Release`, but more configurations can be defined. We will create separate Grid build directories for each configuration, using the Grid **Autoconf** build system to make each configuration. NB: it is **not** necessary to run `make install` on them once they are built (IDE features such as *jump to definition* will work better of you don't).
@ -180,7 +198,7 @@ Debug configuration with MPI:
    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug
-### 4.3 Build Grid
+### 5.3 Build Grid
 Each configuration must be built before they can be used. You can either:
--- a/systems/Lumi/benchmarks/bench2.slurm
+++ b/systems/Lumi/benchmarks/bench2.slurm
@ -1,44 +0,0 @@
 #!/bin/bash -l
 #SBATCH --job-name=bench_lehner
 #SBATCH --partition=small-g
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=7
 #SBATCH --gpus-per-node=8
 #SBATCH --time=00:10:00
 #SBATCH --account=project_465000546
 #SBATCH --gpu-bind=none
 #SBATCH --exclusive
 #SBATCH --mem=0
 CPU_BIND="map_cpu:48,56,32,40,16,24,1,8"
 echo $CPU_BIND
 cat << EOF > select_gpu
 #!/bin/bash
 export GPU_MAP=(0 1 2 3 4 5 6 7)
 export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
 export HIP_VISIBLE_DEVICES=\$GPU
 unset ROCR_VISIBLE_DEVICES
 echo RANK \$SLURM_LOCALID using GPU \$GPU    
 exec \$*
 EOF
 chmod +x ./select_gpu
 root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
 source ${root}/sourceme.sh
 export OMP_NUM_THREADS=7
 export MPICH_GPU_SUPPORT_ENABLED=1
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 for vol in 16.16.16.64 32.32.32.64  32.32.32.128
 do
 srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
 #srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol
 srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
 #srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
 done
--- a/systems/Lumi/config-command
+++ b/systems/Lumi/config-command
@ -3,28 +3,30 @@ spack load gmp
 spack load mpfr
 CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
 GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
-MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
+MPFR=`spack find --paths mpfr | grep mpfr | cut -c 12-`
-echo clime X$CLIME
+echo clime $CLIME
-echo gmp X$GMP
+echo gmp $GMP
-echo mpfr X$MPFR
+echo mpfr $MPFR
-../../configure \
+../../configure --enable-comms=mpi-auto \
 --enable-comms=mpi-auto \
 --with-lime=$CLIME \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-tracing=timer \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
--enable-accelerator-cshift \
+--disable-accelerator-cshift \
--with-gmp=$GMP \
+--with-gmp=$OLCF_GMP_ROOT \
 --with-mpfr=$MPFR \
 --with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
-  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
+CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 --amdgpu-target=gfx90a" \
-  LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp" 
+ LDFLAGS="-L/lib64 -L/opt/rocm-5.2.0/lib/ -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 --amdgpu-target=gfx90a "
 #--enable-simd=GPU-RRII \
--- a/systems/Lumi/sourceme.sh
+++ b/systems/Lumi/sourceme.sh
@ -1,5 +1 @@
-source ~/spack/share/spack/setup-env.sh
+module load CrayEnv LUMI/22.12 partition/G  cray-fftw/3.3.10.1
 module load CrayEnv LUMI/22.12 partition/G  cray-fftw/3.3.10.1 rocm
 spack load c-lime
 spack load gmp
 spack load mpfr
--- a/systems/OEM/README
+++ b/systems/OEM/README
@ -1,53 +0,0 @@
 1. Prerequisites:
 ===================
 Make sure you have the latest Intel ipcx release loaded (via modules or similar)
 Make sure you have SYCL aware MPICH or Intel MPI loaded (assumed as mpicxx)
 2. Obtain Grid:
 ===================
 bash$
 git clone https://github.com/paboyle/Grid
 cd Grid
 ./bootstrap.sh
 cd systems/PVC
 3. Build Grid:
 ===================
 Here, configure command is stored in file config-command:
 bash$
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
 	LDFLAGS="-fiopenmp  -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "
 make all
 4. Run a benchmark:
 ===================
 *** Assumes interactive access to node. ***
 run Benchmark_dwf_fp32 using benchmarks/bench.sh
 bash$
 cd benchmarks
 ./bench.sh
--- a/systems/OEM/benchmarks/bench.sh
+++ b/systems/OEM/benchmarks/bench.sh
@ -1,18 +0,0 @@
 #!/bin/bash
 export EnableImplicitScaling=0
 export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 mpiexec -launcher ssh -n 1 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 1tile.log
 mpiexec -launcher ssh -n 2 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 2tile.log
 #mpiexec -launcher ssh -n 4 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 16.16.64.64 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 4tile.log
 #mpiexec -launcher ssh -n 8 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.4 --grid 16.16.64.128 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 8tile.log
--- a/systems/OEM/benchmarks/select_gpu.sh
+++ b/systems/OEM/benchmarks/select_gpu.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 num_tile=2
 gpu_id=$(( (MPI_LOCAL_RANKID % num_tile ) ))
 tile_id=$((MPI_LOCAL_RANKID / num_tile))
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 echo "local rank $MPI_LOCALRANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
 "$@"
--- a/systems/OEM/config-command
+++ b/systems/OEM/config-command
@ -1,15 +0,0 @@
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
 	LDFLAGS="-fiopenmp  -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "
--- a/systems/OEM/setup.sh
+++ b/systems/OEM/setup.sh
@ -1,3 +0,0 @@
 export https_proxy=http://proxy-chain.intel.com:911
 module load intel-release
 module load intel/mpich
--- a/systems/Sunspot/benchmarks/bench.pbs
+++ b/systems/Sunspot/benchmarks/bench.pbs
@ -1,46 +0,0 @@
 #!/bin/bash
 #PBS -l select=1:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=01:00:00
 #PBS -N dwf
 #PBS -k doe
 HDIR=/home/paboyle/
 module use /soft/testing/modulefiles/
 module load intel-UMD23.05.25593.11/23.05.25593.11
 module load tools/pti-gpu  
 export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
 export PATH=$HDIR/tools/bin:$PATH
 export TZ='/usr/share/zoneinfo/US/Central'
 export OMP_PROC_BIND=spread
 export OMP_NUM_THREADS=3
 unset OMP_PLACES
 cd $PBS_O_WORKDIR
 qsub jobscript.pbs
 echo Jobid: $PBS_JOBID
 echo Running on host `hostname`
 echo Running on nodes `cat $PBS_NODEFILE`
 echo NODES
 cat $PBS_NODEFILE
 NNODES=`wc -l < $PBS_NODEFILE`
 NRANKS=12         # Number of MPI ranks per node
 NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node
 NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
 NTOTRANKS=$(( NNODES * NRANKS ))
 echo "NUM_NODES=${NNODES}  TOTAL_RANKS=${NTOTRANKS}  RANKS_PER_NODE=${NRANKS}  THREADS_PER_RANK=${OMP_NUM_THREADS}"
 echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
 CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
 	     ./gpu_tile_compact.sh \
 	./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
 	--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
--- a/systems/Sunspot/benchmarks/gpu_tile_compact.sh
+++ b/systems/Sunspot/benchmarks/gpu_tile_compact.sh
@ -1,52 +0,0 @@
 #!/bin/bash
 display_help() {
  echo " Will map gpu tile to rank in compact and then round-robin fashion"
  echo " Usage (only work for one node of ATS/PVC):"
  echo "   mpiexec --np N gpu_tile_compact.sh ./a.out"
  echo
  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
  echo "   0 Rank 0.0"
  echo "   1 Rank 0.1"
  echo "   2 Rank 1.0"
  echo "   3 Rank 1.1"
  echo "   4 Rank 2.0"
  echo "   5 Rank 2.1"
  echo "   6 Rank 0.0"
  echo
  echo " Hacked together by apl@anl.gov, please contact if bug found"
  exit 1
 }
 #This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
 #works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
 num_gpu=6
 num_tile=2
 if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
  display_help
 fi
 gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
 tile_id=$((PALS_LOCAL_RANKID % num_tile))
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
 export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
 if [ $PALS_LOCAL_RANKID = 0 ]
 then
    onetrace --chrome-device-timeline "$@"
 #    "$@"
 else
 "$@"
 fi
--- a/systems/Sunspot/config-command
+++ b/systems/Sunspot/config-command
@ -1,16 +0,0 @@
 TOOLS=$HOME/tools
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \
 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
--- a/tests/core/Test_fft_pf.cc
+++ b/tests/core/Test_fft_pf.cc
@ -1,307 +0,0 @@
    /*************************************************************************************
    grid` physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_cshift.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1});
  Coordinate mpi_layout  = GridDefaultMpi();
  int vol = 1;
  for(int d=0;d<latt_size.size();d++){
    vol = vol * latt_size[d];
  }
  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian RBGRID(&GRID);
  ComplexD ci(0.0,1.0);
  std::vector<int> seeds({1,2,3,4});
  GridSerialRNG          sRNG;  sRNG.SeedFixedIntegers(seeds); // naughty seeding
  GridParallelRNG          pRNG(&GRID);
  pRNG.SeedFixedIntegers(seeds);
  LatticeGaugeFieldD Umu(&GRID);
  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
  ////////////////////////////////////////////////////
  // PF prop
  ////////////////////////////////////////////////////
  LatticeFermionD    src(&GRID);
  gaussian(pRNG,src);
 #if 1
    Coordinate point(4,0);
    src=Zero();
    SpinColourVectorD ferm; gaussian(sRNG,ferm);
    pokeSite(ferm,src,point);
 #endif
  {
    std::cout<<"****************************************"<<std::endl;
    std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator \n";
    std::cout<<"****************************************"<<std::endl;
    //    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    diff(&GRID);
    const int Ls=48+1;
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
    RealD mass=0.1;
    RealD M5  =0.8;
    OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
    bool fiveD = false; //calculate 4d free propagator
    std::cout << " Free propagator " <<std::endl;
    Dov.FreePropagator(src,ref,mass) ;
    std::cout << " Free propagator norm "<< norm2(ref) <<std::endl;
    Gamma G5(Gamma::Algebra::Gamma5);
    LatticeFermionD    src5(FGrid); src5=Zero();
    LatticeFermionD    tmp5(FGrid); 
    LatticeFermionD    result5(FGrid); result5=Zero();
    LatticeFermionD    result4(&GRID); 
    const int sdir=0;
    ////////////////////////////////////////////////////////////////////////
    // Import
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Free propagator Import "<< norm2(src) <<std::endl;
    Dov.ImportPhysicalFermionSource  (src,src5);
    std::cout << " Free propagator Imported "<< norm2(src5) <<std::endl;
    ////////////////////////////////////////////////////////////////////////
    // Conjugate gradient on normal equations system
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dov.Mdag(src5,tmp5);
    src5=tmp5;
    MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
    CG(HermOp,src5,result5);
    ////////////////////////////////////////////////////////////////////////
    // Domain wall physical field propagator
    ////////////////////////////////////////////////////////////////////////
    Dov.ExportPhysicalFermionSolution(result5,result4);
    // From DWF4d.pdf :
    //
    // Dov_pf = 2/(1-m) D_cayley_ovlap  [ Page 43 ]
    // Dinv_cayley_ovlap = 2/(1-m) Dinv_pf 
    // Dinv_cayley_surface =1/(1-m) ( Dinv_cayley_ovlap - 1 ) =>  2/(1-m)^2 Dinv_pf - 1/(1-m) * src   [ Eq.2.67 ]
    RealD scale = 2.0/(1.0-mass)/(1.0-mass);
    result4 = result4 * scale;
    result4 = result4 - src*(1.0/(1.0-mass)); // Subtract contact term
    DumpSliceNorm("Src",src);
    DumpSliceNorm("Grid",result4);
    DumpSliceNorm("Fourier",ref);
    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
    diff = result4- ref;
    DumpSliceNorm("diff ",diff);
  }
  ////////////////////////////////////////////////////
  // Dwf prop
  ////////////////////////////////////////////////////
  {
    std::cout<<"****************************************"<<std::endl;
    std::cout << "Testing Dov(Hw) Mom space 4d propagator \n";
    std::cout<<"****************************************"<<std::endl;
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    diff(&GRID);
    const int Ls=48;
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
    RealD mass=0.1;
    RealD M5  =0.8;
    OverlapWilsonCayleyTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
    Dov.FreePropagator(src,ref,mass) ;
    Gamma G5(Gamma::Algebra::Gamma5);
    LatticeFermionD    src5(FGrid); src5=Zero();
    LatticeFermionD    tmp5(FGrid); 
    LatticeFermionD    result5(FGrid); result5=Zero();
    LatticeFermionD    result4(&GRID); 
    const int sdir=0;
    ////////////////////////////////////////////////////////////////////////
    // Domain wall physical field source; need D_minus
    ////////////////////////////////////////////////////////////////////////
    /*
 	chi_5[0]   = chiralProjectPlus(chi);
 	chi_5[Ls-1]= chiralProjectMinus(chi);
    */      
    tmp =   (src + G5*src)*0.5;      InsertSlice(tmp,src5,   0,sdir);
    tmp =   (src - G5*src)*0.5;      InsertSlice(tmp,src5,Ls-1,sdir);
    ////////////////////////////////////////////////////////////////////////
    // Conjugate gradient on normal equations system
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dov.Dminus(src5,tmp5);
    src5=tmp5;
    Dov.Mdag(src5,tmp5);
    src5=tmp5;
    MdagMLinearOperator<OverlapWilsonCayleyTanhFermionD,LatticeFermionD> HermOp(Dov);
    ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
    CG(HermOp,src5,result5);
    ////////////////////////////////////////////////////////////////////////
    // Domain wall physical field propagator
    ////////////////////////////////////////////////////////////////////////
    /*
      psi  = chiralProjectMinus(psi_5[0]);
      psi += chiralProjectPlus(psi_5[Ls-1]);
    */
    ExtractSlice(tmp,result5,0   ,sdir);   result4 =         (tmp-G5*tmp)*0.5;
    ExtractSlice(tmp,result5,Ls-1,sdir);   result4 = result4+(tmp+G5*tmp)*0.5;
    std::cout << " Taking difference" <<std::endl;
    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
    DumpSliceNorm("Grid",result4);
    DumpSliceNorm("Fourier",ref);
    diff = ref - result4;
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
    DumpSliceNorm("diff",diff);
  }
  {
    std::cout<<"****************************************"<<std::endl;
    std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator with q\n";
    std::cout<<"****************************************"<<std::endl;
    //    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    diff(&GRID);
    const int Ls=48+1;
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
    RealD mass=0.1;
    RealD M5  =0.8;
    OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
    bool fiveD = false; //calculate 4d free propagator
    std::cout << " Free propagator " <<std::endl;
    Dov.FreePropagator(src,ref,mass) ;
    std::cout << " Free propagator norm "<< norm2(ref) <<std::endl;
    Gamma G5(Gamma::Algebra::Gamma5);
    LatticeFermionD    src5(FGrid); src5=Zero();
    LatticeFermionD    tmp5(FGrid); 
    LatticeFermionD    result5(FGrid); result5=Zero();
    LatticeFermionD    result4(&GRID); 
    const int sdir=0;
    ////////////////////////////////////////////////////////////////////////
    // Import
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Free propagator Import "<< norm2(src) <<std::endl;
    Dov.ImportPhysicalFermionSource  (src,src5);
    std::cout << " Free propagator Imported "<< norm2(src5) <<std::endl;
    ////////////////////////////////////////////////////////////////////////
    // Conjugate gradient on normal equations system
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dov.Mdag(src5,tmp5);
    src5=tmp5;
    MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
    CG(HermOp,src5,result5);
    ////////////////////////////////////////////////////////////////////////
    // Domain wall physical field propagator
    ////////////////////////////////////////////////////////////////////////
    Dov.ExportPhysicalFermionSolution(result5,result4);
    // From DWF4d.pdf :
    //
    // Dov_pf = 2/(1-m) D_cayley_ovlap  [ Page 43 ]
    // Dinv_cayley_ovlap = 2/(1-m) Dinv_pf 
    // Dinv_cayley_surface =1/(1-m) ( Dinv_cayley_ovlap - 1 ) =>  2/(1-m)^2 Dinv_pf - 1/(1-m) * src   [ Eq.2.67 ]
    RealD scale = 2.0/(1.0-mass)/(1.0-mass);
    result4 = result4 * scale;
    result4 = result4 - src*(1.0/(1.0-mass)); // Subtract contact term
    DumpSliceNorm("Src",src);
    DumpSliceNorm("Grid",result4);
    DumpSliceNorm("Fourier",ref);
    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
    diff = result4- ref;
    DumpSliceNorm("diff ",diff);
  }
  Grid_finalize();
 }
--- a/tests/debug/Test_iwasaki_action_newstaple.cc
+++ b/tests/debug/Test_iwasaki_action_newstaple.cc
@ -1,188 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_iwasaki_action_newstaple.cc
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 ////////////////////////////////////////////////////////////////////////
 // PlaqPlusRectangleActoin
 ////////////////////////////////////////////////////////////////////////
 template<class Gimpl>
 class PlaqPlusRectangleActionOrig : public Action<typename Gimpl::GaugeField> {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
 private:
  RealD c_plaq;
  RealD c_rect;
 public:
  PlaqPlusRectangleActionOrig(RealD b,RealD c): c_plaq(b),c_rect(c){};
  virtual std::string action_name(){return "PlaqPlusRectangleActionOrig";}
  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
  virtual std::string LogParameters(){
    std::stringstream sstream;
    sstream << GridLogMessage << "["<<action_name() <<"] c_plaq: " << c_plaq << std::endl;
    sstream << GridLogMessage << "["<<action_name() <<"] c_rect: " << c_rect << std::endl;
    return sstream.str();
  }
  virtual RealD S(const GaugeField &U) {
    RealD vol = U.Grid()->gSites();
    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
    RealD rect = WilsonLoops<Gimpl>::avgRectangle(U);
    RealD action=c_plaq*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5
      +c_rect*(1.0 -rect)*(Nd*(Nd-1.0))*vol;
    return action;
  };
  virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
    //extend Ta to include Lorentz indexes
    RealD factor_p = c_plaq/RealD(Nc)*0.5;
    RealD factor_r = c_rect/RealD(Nc)*0.5;
    GridBase *grid = Umu.Grid();
    std::vector<GaugeLinkField> U (Nd,grid);
    std::vector<GaugeLinkField> U2(Nd,grid);
    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
      WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
    }
    GaugeLinkField dSdU_mu(grid);
    GaugeLinkField staple(grid);
    for (int mu=0; mu < Nd; mu++){
      // Staple in direction mu
      WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
      dSdU_mu = Ta(U[mu]*staple)*factor_p;
      WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
      dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
  };
 };
 // Convenience for common physically defined cases.
 //
 // RBC c1 parameterisation is not really RBC but don't have good
 // reference and we are happy to change name if prior use of this plaq coeff
 // parameterisation is made known to us. 
 template<class Gimpl>
 class RBCGaugeActionOrig : public PlaqPlusRectangleActionOrig<Gimpl> {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  RBCGaugeActionOrig(RealD beta,RealD c1) : PlaqPlusRectangleActionOrig<Gimpl>(beta*(1.0-8.0*c1), beta*c1) {};
  virtual std::string action_name(){return "RBCGaugeActionOrig";}
 };
 template<class Gimpl>
 class IwasakiGaugeActionOrig : public RBCGaugeActionOrig<Gimpl> {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  IwasakiGaugeActionOrig(RealD beta) : RBCGaugeActionOrig<Gimpl>(beta,-0.331) {};
  virtual std::string action_name(){return "IwasakiGaugeActionOrig";}
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  Coordinate latt_size  = GridDefaultLatt();
  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
  Coordinate mpi_layout = GridDefaultMpi();
  std::cout << " mpi "<<mpi_layout<<std::endl;
  std::cout << " simd "<<simd_layout<<std::endl;
  std::cout << " latt "<<latt_size<<std::endl;
  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
  GridParallelRNG   pRNG(&GRID);
  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
  LatticeGaugeField U(&GRID);
  SU<Nc>::HotConfiguration(pRNG,U);
  //#define PRD
 #ifdef PRD
  typedef PeriodicGimplD Gimpl;
 #else
  typedef ConjugateGimplD Gimpl;
  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
  Gimpl::setDirections(conj_dirs);
 #endif
  typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
  typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;
  GaugeLorentz derivOrig(&GRID), derivNew(&GRID);
  double beta = 2.13;
  IwasakiGaugeActionOrig<Gimpl> action_orig(beta);
  IwasakiGaugeAction<Gimpl> action_new(beta);
  double torig=0, tnew=0;
  int ntest = 10;
  for(int i=0;i<ntest;i++){
    double t0 = usecond();
    action_orig.deriv(U, derivOrig);
    double t1 = usecond();
    action_new.deriv(U, derivNew);
    double t2 = usecond();
    GaugeLorentz diff = derivOrig - derivNew;
    double n = norm2(diff);
    std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
    assert(n<1e-10);
    std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms,  new: " << (t2-t1)/1000 << "ms" << std::endl;
    torig += (t1-t0)/1000; tnew += (t2-t1)/1000;
  }
  std::cout << GridLogMessage << "Avg timings " << ntest << " iterations: orig:" << torig/ntest << "ms,   new:" << tnew/ntest << "ms" << std::endl;
  Grid_finalize();
 }
--- a/tests/debug/Test_optimized_staple_gaugebc.cc
+++ b/tests/debug/Test_optimized_staple_gaugebc.cc
@ -1,94 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_optimized_staple_gaugebc.cc
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 using namespace std;
 using namespace Grid;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  Coordinate latt_size  = GridDefaultLatt();
  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
  Coordinate mpi_layout = GridDefaultMpi();
  std::cout << " mpi "<<mpi_layout<<std::endl;
  std::cout << " simd "<<simd_layout<<std::endl;
  std::cout << " latt "<<latt_size<<std::endl;
  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
  GridParallelRNG   pRNG(&GRID);
  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
  LatticeGaugeField U(&GRID);
  SU<Nc>::HotConfiguration(pRNG,U);
  //#define PRD
 #ifdef PRD
  typedef PeriodicGimplD Gimpl;
 #else
  typedef ConjugateGimplD Gimpl;
  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
  Gimpl::setDirections(conj_dirs);
 #endif
  typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
  typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;
  int count = 0;
  double torig=0, topt=0;
  std::vector<GaugeMat> Umu(Nd,&GRID), U2(Nd,&GRID);
  for(int mu=0;mu<Nd;mu++){
    Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
    WilsonLoops<Gimpl>::RectStapleDouble(U2[mu], Umu[mu], mu);
  }
  std::cout << GridLogMessage << "Checking optimized vs unoptimized RectStaple" << std::endl;
  for(int mu=0;mu<Nd;mu++){
    GaugeMat staple_orig(&GRID), staple_opt(&GRID), staple_U2(&GRID);
    double t0 = usecond();
    WilsonLoops<Gimpl>::RectStapleUnoptimised(staple_orig,U,mu);
    double t1 = usecond();
    WilsonLoops<Gimpl>::RectStapleOptimised(staple_opt, U2, Umu, mu);
    double t2 = usecond();
    torig += t1-t0;  topt += t2-t1;
    ++count;
    GaugeMat diff = staple_orig - staple_opt;
    double n = norm2(diff);
    std::cout << GridLogMessage << mu << " " << n << std::endl;
    assert(n<1e-10);
  }
  std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  optimized: " << topt/1000/count << "ms" << std::endl;
  Grid_finalize();
 }
--- a/tests/debug/Test_padded_cell_staple.cc
+++ b/tests/debug/Test_padded_cell_staple.cc
@ -1,580 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell_staple.cc
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 using namespace std;
 using namespace Grid;
 template <class Gimpl> class WilsonLoopsTest : public Gimpl {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;
  //Original implementation
  static void StapleOrig(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
 			 int nu) {
    GridBase *grid = Umu.Grid();
    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
    staple = Zero();
    if (nu != mu) {
      // mu
      // ^
      // |__>  nu
      //    __
      //      |
      //    __|
      //
      //Forward: Out(x) = Link(x)*field(x+mu)
      //Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
      //ShiftStaple: Link(x) = Link(x+mu)
      //tmp1 = U^dag_nu(x-nu)
      //tmp2 = U^dag_mu(x-mu) tmp1(x-mu) = U^dag_mu(x-mu) U^dag_nu(x-nu-mu)
      //tmp3 = U_nu(x) tmp2(x+nu) = U_nu(x)U^dag_mu(x-mu+nu) U^dag_nu(x-mu)
      //tmp4 = tmp(x+mu) = U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
      staple += Gimpl::ShiftStaple(
 				   Gimpl::CovShiftForward(
 							  U[nu], nu,
 							  Gimpl::CovShiftBackward(
 										  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
 				   mu);
      //  __
      // |
      // |__
      //
      //
      //tmp1 = U_mu^dag(x-mu) U_nu(x-mu)
      //tmp2 = U_nu^dag(x-nu) tmp1(x-nu) = U_nu^dag(x-nu) U_mu^dag(x-mu-nu) U_nu(x-mu-nu)
      //tmp3 = tmp2(x+mu) = U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
      staple += Gimpl::ShiftStaple(
 				   Gimpl::CovShiftBackward(U[nu], nu,
 							   Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
 				   mu);
    }
  }
  static void StaplePadded(GaugeMat &staple, const GaugeLorentz &U, int mu,
 			   int nu) {
    if(nu==mu){
      staple = Zero();
      return;
    }
    double peek = 0, construct = 0, exchange = 0, coord = 0, stencil =0, kernel = 0, extract = 0, total = 0;
    double tstart = usecond();
    double t=tstart;
    PaddedCell Ghost(1, (GridCartesian*)U.Grid());
    construct += usecond() - t;
    t=usecond();      
    GaugeMat U_mu = PeekIndex<LorentzIndex>(U, mu);
    GaugeMat U_nu = PeekIndex<LorentzIndex>(U, nu);
    peek += usecond() - t;
    t=usecond();
    CshiftImplGauge<Gimpl> cshift_impl;
    GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl);
    GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl);
    exchange += usecond() - t;
    GridBase *ggrid = Ug_mu.Grid();
    GaugeMat gStaple(ggrid);
    t=usecond();
    Coordinate shift_0(Nd,0);
    Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
    Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
    Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
    Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
    std::vector<Coordinate> shifts;
    //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
    shifts.push_back(shift_0);
    shifts.push_back(shift_nu);
    shifts.push_back(shift_mu);
    //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
    shifts.push_back(shift_mnu);
    shifts.push_back(shift_mnu);
    shifts.push_back(shift_mnu_pmu);
    coord += usecond()-t;
    t=usecond();
    GeneralLocalStencil gStencil(ggrid,shifts);
    stencil += usecond() -t;
    t=usecond();
    {
      autoView( gStaple_v , gStaple, AcceleratorWrite);
      auto gStencil_v = gStencil.View();
      autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
      autoView( Ug_nu_v , Ug_nu, AcceleratorRead);
      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
 	  GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss);
 	  auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
 	  e = gStencil_v.GetEntry(1,ss);
 	  auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
 	  e = gStencil_v.GetEntry(2,ss);
 	  auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
 	  auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x;
 	  e = gStencil_v.GetEntry(3,ss);
 	  auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
 	  e = gStencil_v.GetEntry(4,ss);
 	  auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
 	  e = gStencil_v.GetEntry(5,ss);
 	  auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
 	  stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu;
 	  coalescedWrite(gStaple_v[ss],stencil_ss);
 	}
 	);
    } //ensure views are all closed!
    kernel += usecond() - t;
    t=usecond();
    staple = Ghost.Extract(gStaple);
    extract += usecond()-t;
    total += usecond() - tstart;
    std::cout << GridLogMessage << "StaplePadded timings peek:" << peek << " construct:" << construct << " exchange:" << exchange << " coord:" << coord << " stencil:" << stencil << " kernel:" << kernel << " extract:" << extract << " total:" << total << std::endl;
  }
  static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu,
 			     int mu) {
    GridBase *grid = Umu.Grid();
    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
    Stap = Zero();
    for (int nu = 0; nu < Nd; nu++) {
      if (nu != mu) {
        //           __ ___
        //          |    __ |
        //
 	//tmp1 = U_nu^dag(x-nu)
 	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu) U_nu^dag(x-nu-mu)
 	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu) U_mu^dag(x-2mu) U_nu^dag(x-nu-2mu)
 	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu) U_mu^dag(x-2mu+nu) U_nu^dag(x-2mu)
 	//tmp5 = U_mu(x)tmp4(x+mu) = U_mu(x)U_nu(x+mu)U_mu^dag(x+nu) U_mu^dag(x-mu+nu) U_nu^dag(x-mu)
 	//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
        Stap += Gimpl::ShiftStaple(
 				   Gimpl::CovShiftForward(
 							  U[mu], mu,
 							  Gimpl::CovShiftForward(
 										 U[nu], nu,
 										 Gimpl::CovShiftBackward(
 													 U[mu], mu,
 													 Gimpl::CovShiftBackward(
 																 U[mu], mu,
 																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
 				   mu);
        //              __
        //          |__ __ |
 	//tmp1 = U^dag_mu(x-mu)U_nu(x-mu)
 	//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)
 	//tmp3 = U^dag_nu(x-nu)tmp2(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)
 	//tmp4 = U_mu(x)tmp3(x+mu) = U_mu(x)U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)
 	//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
        Stap += Gimpl::ShiftStaple(
 				   Gimpl::CovShiftForward(
 							  U[mu], mu,
 							  Gimpl::CovShiftBackward(
 										  U[nu], nu,
 										  Gimpl::CovShiftBackward(
 													  U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
 				   mu);
        //           __
        //          |__ __ |
 	//Forward: Out(x) = Link(x)*field(x+mu)
 	//Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
 	//ShiftStaple: Link(x) = Link(x+mu)
 	//tmp1 = U_nu(x)U_mu(x+nu)
 	//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U_nu(x-mu)U_mu(x+nu-mu)
 	//tmp3 = U^dag_mu(x-mu)tmp2(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)U_mu(x+nu-2mu)
 	//tmp4 = U^dag_nu(x-nu)tmp3(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)U_mu(x-2mu)
 	//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
        Stap += Gimpl::ShiftStaple(
 				   Gimpl::CovShiftBackward(
 							   U[nu], nu,
 							   Gimpl::CovShiftBackward(
 										   U[mu], mu,
 										   Gimpl::CovShiftBackward(
 													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
 				   mu);
        //           __ ___
        //          |__    |
 	//tmp1 = U_nu^dag(x-nu)U_mu(x-nu)
 	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_mu(x-mu-nu)
 	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_mu^dag(x-2mu)U_nu^dag(x-2mu-nu)U_mu(x-2mu-nu)
 	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_mu^dag(x-2mu+nu)U_nu^dag(x-2mu)U_mu(x-2mu)
 	//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
        Stap += Gimpl::ShiftStaple(
 				   Gimpl::CovShiftForward(
 							  U[nu], nu,
 							  Gimpl::CovShiftBackward(
 										  U[mu], mu,
 										  Gimpl::CovShiftBackward(
 													  U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
 				   mu);
        //       --
        //      |  |
        //
        //      |  |
 	//tmp1 = U_nu^dag(x-nu)
 	//tmp2 = U_nu^dag(x-nu)tmp1(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)
 	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_nu^dag(x-mu-2nu)
 	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_nu^dag(x-mu-nu)
 	//tmp5 = U_nu(x)tmp4(x+nu) = U_nu(x)U_nu(x+nu)U_mu^dag(x-mu+2nu)U_nu^dag(x-mu+nu)U_nu^dag(x-mu)
 	//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
        Stap += Gimpl::ShiftStaple(
 				   Gimpl::CovShiftForward(
 							  U[nu], nu,
 							  Gimpl::CovShiftForward(
 										 U[nu], nu,
 										 Gimpl::CovShiftBackward(
 													 U[mu], mu,
 													 Gimpl::CovShiftBackward(
 																 U[nu], nu,
 																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
 				   mu);
        //      |  |
        //
        //      |  |
        //       --
 	//tmp1 = U_nu(x)U_nu(x+nu)
 	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu(x-mu)U_nu(x-mu+nu)
 	//tmp3 = U_nu^dag(x-nu)tmp2(x-nu) = U_nu^dag(x-nu)U_mu^dag(x-mu-nu)U_nu(x-mu-nu)U_nu(x-mu)
 	//tmp4 = U_nu^dag(x-nu)tmp3(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)U_mu^dag(x-mu-2nu)U_nu(x-mu-2nu)U_nu(x-mu-nu)
 	//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
        Stap += Gimpl::ShiftStaple(
 				   Gimpl::CovShiftBackward(
 							   U[nu], nu,
 							   Gimpl::CovShiftBackward(
 										   U[nu], nu,
 										   Gimpl::CovShiftBackward(
 													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
 				   mu);
      }
    }
  }
  static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U,
 			       int mu) {
    PaddedCell Ghost(2,(GridCartesian*)U.Grid());
    GridBase *ggrid = Ghost.grids.back();
    CshiftImplGauge<Gimpl> cshift_impl;
    std::vector<GaugeMat> Ug_dirs(Nd,ggrid);
    for(int i=0;i<Nd;i++) Ug_dirs[i] = Ghost.Exchange(PeekIndex<LorentzIndex>(U, i), cshift_impl);
    GaugeMat gStaple(ggrid);
    std::vector<Coordinate> shifts;
    for (int nu = 0; nu < Nd; nu++) {
      if (nu != mu) {
 	auto genShift = [&](int mushift,int nushift){
 	  Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
 	};
 	//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
 	shifts.push_back(genShift(0,0));
 	shifts.push_back(genShift(0,+1));
 	shifts.push_back(genShift(+1,+1));
 	shifts.push_back(genShift(+2,0));
 	shifts.push_back(genShift(+1,0));
 	//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
 	shifts.push_back(genShift(0,-1));
 	shifts.push_back(genShift(0,-1));
 	shifts.push_back(genShift(+1,-1));
 	shifts.push_back(genShift(+2,-1));
 	shifts.push_back(genShift(+1,0));
 	//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
 	shifts.push_back(genShift(-1,0));
 	shifts.push_back(genShift(-1,-1));
 	shifts.push_back(genShift(-1,-1));
 	shifts.push_back(genShift(0,-1));
 	shifts.push_back(genShift(+1,-1));
 	//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
 	shifts.push_back(genShift(-1,0));
 	shifts.push_back(genShift(-1,0));
 	shifts.push_back(genShift(-1,+1));
 	shifts.push_back(genShift(0,+1));
 	shifts.push_back(genShift(+1,0));
 	//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
 	shifts.push_back(genShift(0,0));
 	shifts.push_back(genShift(0,+1));
 	shifts.push_back(genShift(0,+2));
 	shifts.push_back(genShift(+1,+1));
 	shifts.push_back(genShift(+1,0));
 	//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
 	shifts.push_back(genShift(0,-1));
 	shifts.push_back(genShift(0,-2));
 	shifts.push_back(genShift(0,-2));
 	shifts.push_back(genShift(+1,-2));
 	shifts.push_back(genShift(+1,-1));
      }
    }
    size_t nshift = shifts.size();
    GeneralLocalStencil gStencil(ggrid,shifts);
    {
      autoView( gStaple_v , gStaple, AcceleratorWrite);
      auto gStencil_v = gStencil.View();
      typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
      size_t vsize = Nd*sizeof(GaugeViewType);
      GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
      for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = Ug_dirs[i].View(AcceleratorRead);
      GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
      acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
 	  decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
 	  stencil_ss = Zero();
 	  int s=0;
 	  for(int nu=0;nu<Nd;nu++){
 	    if(nu != mu){
 	      //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
 	      GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
 	      auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 	      e = gStencil_v.GetEntry(s++,ss);
 	      auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
 	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
 	      //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
 	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
 	      //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
 	      //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
 	      //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
 	      //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 	      e = gStencil_v.GetEntry(s++,ss);
 	      U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
 	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
 	    }
 	  }
 	  assert(s==nshift);
 	  coalescedWrite(gStaple_v[ss],stencil_ss);
 	}
 	);
      for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
      free(Ug_dirs_v_host);
      acceleratorFreeDevice(Ug_dirs_v);
    }   
    Stap = Ghost.Extract(gStaple);    
  }
 };  
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  Coordinate latt_size  = GridDefaultLatt();
  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
  Coordinate mpi_layout = GridDefaultMpi();
  std::cout << " mpi "<<mpi_layout<<std::endl;
  std::cout << " simd "<<simd_layout<<std::endl;
  std::cout << " latt "<<latt_size<<std::endl;
  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
  GridParallelRNG   pRNG(&GRID);
  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
  LatticeGaugeField U(&GRID);
  SU<Nc>::HotConfiguration(pRNG,U);
  //typedef PeriodicGimplD Gimpl;
  typedef ConjugateGimplD Gimpl;
  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
  Gimpl::setDirections(conj_dirs);
  typedef typename WilsonLoopsTest<Gimpl>::GaugeMat GaugeMat;
  typedef typename WilsonLoopsTest<Gimpl>::GaugeLorentz GaugeLorentz;
  std::cout << GridLogMessage << "Checking Staple" << std::endl;
  int count = 0;
  double torig=0, tpadded=0;
  for(int mu=0;mu<Nd;mu++){
    for(int nu=0;nu<Nd;nu++){
      if(mu != nu){
 	GaugeMat staple_orig(&GRID), staple_padded(&GRID);
 	double t0 = usecond();
 	WilsonLoopsTest<Gimpl>::StapleOrig(staple_orig,U,mu,nu);
 	double t1 = usecond();
 	WilsonLoopsTest<Gimpl>::StaplePadded(staple_padded,U,mu,nu);
 	double t2 = usecond();
 	torig += t1-t0;  tpadded += t2-t1;
 	++count;
 	GaugeMat diff = staple_orig - staple_padded;
 	double n = norm2(diff);
 	std::cout << GridLogMessage << mu << " " << nu << " " << n << std::endl;
 	assert(n<1e-10);
      }
    }
  }
  std::cout << GridLogMessage << "Staple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl;
  count=0; torig=tpadded=0;
  std::cout << GridLogMessage << "Checking RectStaple" << std::endl;
  for(int mu=0;mu<Nd;mu++){
    GaugeMat staple_orig(&GRID), staple_padded(&GRID);
    double t0 = usecond();
    WilsonLoopsTest<Gimpl>::RectStapleOrig(staple_orig,U,mu);
    double t1 = usecond();
    WilsonLoopsTest<Gimpl>::RectStaplePadded(staple_padded,U,mu);
    double t2 = usecond();
    torig += t1-t0;  tpadded += t2-t1;
    ++count;
    GaugeMat diff = staple_orig - staple_padded;
    double n = norm2(diff);
    std::cout << GridLogMessage << mu << " " << n << std::endl;
    assert(n<1e-10);
  }
  std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl;
  Grid_finalize();
 }
Author	SHA1	Message	Date
Peter Boyle	09146cfc43	Profiling temporary code until optimised	2023-06-15 06:54:10 -04:00
Peter Boyle	a450e96827	Optional checkpoint smeared configs for FTHMC	2023-06-14 04:54:29 -04:00
Peter Boyle	0f3678b9be	Additional tests	2023-06-13 11:57:11 -04:00
Peter Boyle	8dd8338e14	Hot start should be properly Hot	2023-06-13 11:56:37 -04:00
Peter Boyle	11e0dc9851	Ta project	2023-06-13 11:56:11 -04:00
Peter Boyle	f4ef6dae43	Keep methods virtual	2023-06-13 11:55:05 -04:00
Peter Boyle	b6e147372b	Clean up	2023-06-13 11:54:11 -04:00
Peter Boyle	3a4a662dc6	Integrator over to smeared force structure	2023-06-13 11:53:38 -04:00
Peter Boyle	8d06bda6fb	Smeared action virtual class	2023-06-13 11:49:09 -04:00