mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-21 17:22:03 +01:00
Compare commits
1 Commits
018e6da872
...
debug-crus
Author | SHA1 | Date | |
---|---|---|---|
bbec7f9fa9 |
@ -419,15 +419,14 @@ until convergence
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nconv < Nstop ) {
|
||||
if ( Nconv < Nstop )
|
||||
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
|
||||
std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
|
||||
}
|
||||
|
||||
eval=eval2;
|
||||
|
||||
//Keep only converged
|
||||
eval.resize(Nstop);// was Nconv
|
||||
evec.resize(Nstop,grid);// was Nconv
|
||||
eval.resize(Nconv);// Nstop?
|
||||
evec.resize(Nconv,grid);// Nstop?
|
||||
basisSortInPlace(evec,eval,reverse);
|
||||
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#define Mheader "SharedMemoryMpi: "
|
||||
#define header "SharedMemoryMpi: "
|
||||
|
||||
#include <Grid/GridCore.h>
|
||||
#include <pwd.h>
|
||||
@ -174,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
|
||||
|
||||
if ( WorldRank == 0) {
|
||||
std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
|
||||
std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
|
||||
std::cout << header " World communicator of size " <<WorldSize << std::endl;
|
||||
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
|
||||
}
|
||||
// WorldShmComm, WorldShmSize, WorldShmRank
|
||||
|
||||
@ -452,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
|
||||
#ifdef GRID_MPI3_SHMGET
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
|
||||
@ -537,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
@ -580,7 +580,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if ( WorldRank == 0 ){
|
||||
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
|
||||
}
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
@ -604,8 +604,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
|
||||
|
||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
|
||||
|
||||
ze_ipc_mem_handle_t ihandle;
|
||||
clone_mem_t handle;
|
||||
@ -744,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_MPI3_SHMMMAP
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -781,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
close(fd);
|
||||
WorldShmCommBufs[r] =ptr;
|
||||
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
|
||||
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
|
||||
}
|
||||
_ShmAlloc=1;
|
||||
_ShmAllocBytes = bytes;
|
||||
@ -791,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_MPI3_SHM_NONE
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -838,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
MPI_Barrier(WorldShmComm);
|
||||
|
@ -47,4 +47,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/lattice/Lattice_transfer.h>
|
||||
#include <Grid/lattice/Lattice_basis.h>
|
||||
#include <Grid/lattice/Lattice_crc.h>
|
||||
#include <Grid/lattice/PaddedCell.h>
|
||||
|
@ -697,68 +697,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
for(int d=0;d<nd;d++){
|
||||
assert(Fg->_processors[d] == Tg->_processors[d]);
|
||||
}
|
||||
|
||||
// the above should guarantee that the operations are local
|
||||
|
||||
#if 1
|
||||
|
||||
size_t nsite = 1;
|
||||
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
|
||||
|
||||
size_t tbytes = 4*nsite*sizeof(int);
|
||||
int *table = (int*)malloc(tbytes);
|
||||
|
||||
thread_for(idx, nsite, {
|
||||
Coordinate from_coor, to_coor;
|
||||
size_t rem = idx;
|
||||
for(int i=0;i<nd;i++){
|
||||
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
|
||||
from_coor[i] = base_i + FromLowerLeft[i];
|
||||
to_coor[i] = base_i + ToLowerLeft[i];
|
||||
}
|
||||
|
||||
int foidx = Fg->oIndex(from_coor);
|
||||
int fiidx = Fg->iIndex(from_coor);
|
||||
int toidx = Tg->oIndex(to_coor);
|
||||
int tiidx = Tg->iIndex(to_coor);
|
||||
int* tt = table + 4*idx;
|
||||
tt[0] = foidx;
|
||||
tt[1] = fiidx;
|
||||
tt[2] = toidx;
|
||||
tt[3] = tiidx;
|
||||
});
|
||||
|
||||
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
||||
acceleratorCopyToDevice(table,table_d,tbytes);
|
||||
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
autoView(from_v,From,AcceleratorRead);
|
||||
autoView(to_v,To,AcceleratorWrite);
|
||||
|
||||
accelerator_for(idx,nsite,1,{
|
||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
int* tt = table_d + 4*idx;
|
||||
int from_oidx = *tt++;
|
||||
int from_lane = *tt++;
|
||||
int to_oidx = *tt++;
|
||||
int to_lane = *tt;
|
||||
|
||||
const vector_type* from = (const vector_type *)&from_v[from_oidx];
|
||||
vector_type* to = (vector_type *)&to_v[to_oidx];
|
||||
|
||||
scalar_type stmp;
|
||||
for(int w=0;w<words;w++){
|
||||
stmp = getlane(from[w], from_lane);
|
||||
putlane(to[w], stmp, to_lane);
|
||||
}
|
||||
});
|
||||
|
||||
acceleratorFreeDevice(table_d);
|
||||
free(table);
|
||||
|
||||
|
||||
#else
|
||||
Coordinate ldf = Fg->_ldimensions;
|
||||
Coordinate rdf = Fg->_rdimensions;
|
||||
Coordinate isf = Fg->_istride;
|
||||
@ -767,9 +707,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
Coordinate ist = Tg->_istride;
|
||||
Coordinate ost = Tg->_ostride;
|
||||
|
||||
autoView( t_v , To, CpuWrite);
|
||||
autoView( f_v , From, CpuRead);
|
||||
thread_for(idx,Fg->lSites(),{
|
||||
autoView( t_v , To, AcceleratorWrite);
|
||||
autoView( f_v , From, AcceleratorRead);
|
||||
accelerator_for(idx,Fg->lSites(),1,{
|
||||
sobj s;
|
||||
Coordinate Fcoor(nd);
|
||||
Coordinate Tcoor(nd);
|
||||
@ -782,24 +722,17 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
|
||||
}
|
||||
if (in_region) {
|
||||
#if 0
|
||||
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
|
||||
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
|
||||
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
|
||||
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
|
||||
scalar_type * fp = (scalar_type *)&f_v[odx_f];
|
||||
scalar_type * tp = (scalar_type *)&t_v[odx_t];
|
||||
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
|
||||
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
|
||||
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
|
||||
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
|
||||
vector_type * fp = (vector_type *)&f_v[odx_f];
|
||||
vector_type * tp = (vector_type *)&t_v[odx_t];
|
||||
for(int w=0;w<words;w++){
|
||||
tp[w].putlane(fp[w].getlane(idx_f),idx_t);
|
||||
}
|
||||
#else
|
||||
peekLocalSite(s,f_v,Fcoor);
|
||||
pokeLocalSite(s,t_v,Tcoor);
|
||||
#endif
|
||||
}
|
||||
});
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -892,8 +825,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
||||
}
|
||||
|
||||
|
||||
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
|
||||
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
|
||||
template<class vobj>
|
||||
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
|
||||
{
|
||||
@ -910,70 +841,11 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
||||
|
||||
for(int d=0;d<nh;d++){
|
||||
if ( d!=orthog ) {
|
||||
assert(lg->_processors[d] == hg->_processors[d]);
|
||||
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
|
||||
}
|
||||
assert(lg->_processors[d] == hg->_processors[d]);
|
||||
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
|
||||
}
|
||||
}
|
||||
|
||||
#if 1
|
||||
size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
|
||||
size_t tbytes = 4*nsite*sizeof(int);
|
||||
int *table = (int*)malloc(tbytes);
|
||||
|
||||
thread_for(idx,nsite,{
|
||||
Coordinate lcoor(nl);
|
||||
Coordinate hcoor(nh);
|
||||
lcoor[orthog] = slice_lo;
|
||||
hcoor[orthog] = slice_hi;
|
||||
size_t rem = idx;
|
||||
for(int mu=0;mu<nl;mu++){
|
||||
if(mu != orthog){
|
||||
int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
|
||||
lcoor[mu] = hcoor[mu] = xmu;
|
||||
}
|
||||
}
|
||||
int loidx = lg->oIndex(lcoor);
|
||||
int liidx = lg->iIndex(lcoor);
|
||||
int hoidx = hg->oIndex(hcoor);
|
||||
int hiidx = hg->iIndex(hcoor);
|
||||
int* tt = table + 4*idx;
|
||||
tt[0] = loidx;
|
||||
tt[1] = liidx;
|
||||
tt[2] = hoidx;
|
||||
tt[3] = hiidx;
|
||||
});
|
||||
|
||||
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
||||
acceleratorCopyToDevice(table,table_d,tbytes);
|
||||
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
autoView(lowDim_v,lowDim,AcceleratorRead);
|
||||
autoView(higherDim_v,higherDim,AcceleratorWrite);
|
||||
|
||||
accelerator_for(idx,nsite,1,{
|
||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
int* tt = table_d + 4*idx;
|
||||
int from_oidx = *tt++;
|
||||
int from_lane = *tt++;
|
||||
int to_oidx = *tt++;
|
||||
int to_lane = *tt;
|
||||
|
||||
const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
|
||||
vector_type* to = (vector_type *)&higherDim_v[to_oidx];
|
||||
|
||||
scalar_type stmp;
|
||||
for(int w=0;w<words;w++){
|
||||
stmp = getlane(from[w], from_lane);
|
||||
putlane(to[w], stmp, to_lane);
|
||||
}
|
||||
});
|
||||
|
||||
acceleratorFreeDevice(table_d);
|
||||
free(table);
|
||||
|
||||
#else
|
||||
// the above should guarantee that the operations are local
|
||||
autoView(lowDimv,lowDim,CpuRead);
|
||||
autoView(higherDimv,higherDim,CpuWrite);
|
||||
@ -989,7 +861,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
||||
pokeLocalSite(s,higherDimv,hcoor);
|
||||
}
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,174 +0,0 @@
|
||||
/*************************************************************************************
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/lattice/PaddedCell.h
|
||||
|
||||
Copyright (C) 2019
|
||||
|
||||
Author: Peter Boyle pboyle@bnl.gov
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#include<Grid/cshift/Cshift.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
|
||||
template<typename vobj>
|
||||
struct CshiftImplBase{
|
||||
virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
|
||||
virtual ~CshiftImplBase(){}
|
||||
};
|
||||
template<typename vobj>
|
||||
struct CshiftImplDefault: public CshiftImplBase<vobj>{
|
||||
Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
|
||||
};
|
||||
template<typename Gimpl>
|
||||
struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
|
||||
typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
|
||||
};
|
||||
|
||||
class PaddedCell {
|
||||
public:
|
||||
GridCartesian * unpadded_grid;
|
||||
int dims;
|
||||
int depth;
|
||||
std::vector<GridCartesian *> grids;
|
||||
|
||||
~PaddedCell()
|
||||
{
|
||||
DeleteGrids();
|
||||
}
|
||||
PaddedCell(int _depth,GridCartesian *_grid)
|
||||
{
|
||||
unpadded_grid = _grid;
|
||||
depth=_depth;
|
||||
dims=_grid->Nd();
|
||||
AllocateGrids();
|
||||
Coordinate local =unpadded_grid->LocalDimensions();
|
||||
for(int d=0;d<dims;d++){
|
||||
assert(local[d]>=depth);
|
||||
}
|
||||
}
|
||||
void DeleteGrids(void)
|
||||
{
|
||||
for(int d=0;d<grids.size();d++){
|
||||
delete grids[d];
|
||||
}
|
||||
grids.resize(0);
|
||||
};
|
||||
void AllocateGrids(void)
|
||||
{
|
||||
Coordinate local =unpadded_grid->LocalDimensions();
|
||||
Coordinate simd =unpadded_grid->_simd_layout;
|
||||
Coordinate processors=unpadded_grid->_processors;
|
||||
Coordinate plocal =unpadded_grid->LocalDimensions();
|
||||
Coordinate global(dims);
|
||||
|
||||
// expand up one dim at a time
|
||||
for(int d=0;d<dims;d++){
|
||||
|
||||
plocal[d] += 2*depth;
|
||||
|
||||
for(int d=0;d<dims;d++){
|
||||
global[d] = plocal[d]*processors[d];
|
||||
}
|
||||
|
||||
grids.push_back(new GridCartesian(global,simd,processors));
|
||||
}
|
||||
};
|
||||
template<class vobj>
|
||||
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
|
||||
{
|
||||
Lattice<vobj> out(unpadded_grid);
|
||||
|
||||
Coordinate local =unpadded_grid->LocalDimensions();
|
||||
Coordinate fll(dims,depth); // depends on the MPI spread
|
||||
Coordinate tll(dims,0); // depends on the MPI spread
|
||||
localCopyRegion(in,out,fll,tll,local);
|
||||
return out;
|
||||
}
|
||||
template<class vobj>
|
||||
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
|
||||
{
|
||||
GridBase *old_grid = in.Grid();
|
||||
int dims = old_grid->Nd();
|
||||
Lattice<vobj> tmp = in;
|
||||
for(int d=0;d<dims;d++){
|
||||
tmp = Expand(d,tmp,cshift); // rvalue && assignment
|
||||
}
|
||||
return tmp;
|
||||
}
|
||||
// expand up one dim at a time
|
||||
template<class vobj>
|
||||
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
|
||||
{
|
||||
GridBase *old_grid = in.Grid();
|
||||
GridCartesian *new_grid = grids[dim];//These are new grids
|
||||
Lattice<vobj> padded(new_grid);
|
||||
Lattice<vobj> shifted(old_grid);
|
||||
Coordinate local =old_grid->LocalDimensions();
|
||||
Coordinate plocal =new_grid->LocalDimensions();
|
||||
if(dim==0) conformable(old_grid,unpadded_grid);
|
||||
else conformable(old_grid,grids[dim-1]);
|
||||
|
||||
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
|
||||
|
||||
double tins=0, tshift=0;
|
||||
|
||||
// Middle bit
|
||||
double t = usecond();
|
||||
for(int x=0;x<local[dim];x++){
|
||||
InsertSliceLocal(in,padded,x,depth+x,dim);
|
||||
}
|
||||
tins += usecond() - t;
|
||||
|
||||
// High bit
|
||||
t = usecond();
|
||||
shifted = cshift.Cshift(in,dim,depth);
|
||||
tshift += usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
for(int x=0;x<depth;x++){
|
||||
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
|
||||
}
|
||||
tins += usecond() - t;
|
||||
|
||||
// Low bit
|
||||
t = usecond();
|
||||
shifted = cshift.Cshift(in,dim,-depth);
|
||||
tshift += usecond() - t;
|
||||
|
||||
t = usecond();
|
||||
for(int x=0;x<depth;x++){
|
||||
InsertSliceLocal(shifted,padded,x,x,dim);
|
||||
}
|
||||
tins += usecond() - t;
|
||||
|
||||
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
|
||||
|
||||
return padded;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -104,7 +104,6 @@ template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iSca
|
||||
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
|
||||
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
|
||||
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
|
||||
template<typename vtype> using iLorentzComplex = iVector<iScalar<iScalar<vtype> >, Nd > ;
|
||||
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
|
||||
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
|
||||
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
|
||||
@ -179,15 +178,6 @@ typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
|
||||
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
|
||||
typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
|
||||
|
||||
// LorentzComplex
|
||||
typedef iLorentzComplex<Complex > LorentzComplex;
|
||||
typedef iLorentzComplex<ComplexF > LorentzComplexF;
|
||||
typedef iLorentzComplex<ComplexD > LorentzComplexD;
|
||||
|
||||
typedef iLorentzComplex<vComplex > vLorentzComplex;
|
||||
typedef iLorentzComplex<vComplexF> vLorentzComplexF;
|
||||
typedef iLorentzComplex<vComplexD> vLorentzComplexD;
|
||||
|
||||
// DoubleStored gauge field
|
||||
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
|
||||
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
|
||||
@ -317,10 +307,6 @@ typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
|
||||
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
|
||||
typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
|
||||
|
||||
typedef Lattice<vLorentzComplex> LatticeLorentzComplex;
|
||||
typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
|
||||
typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
|
||||
|
||||
// DoubleStored gauge field
|
||||
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
|
||||
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
|
||||
|
@ -34,24 +34,10 @@ directory
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
///////////////////////////////////
|
||||
// Smart configuration base class
|
||||
///////////////////////////////////
|
||||
template< class Field >
|
||||
class ConfigurationBase
|
||||
{
|
||||
public:
|
||||
ConfigurationBase() {}
|
||||
virtual ~ConfigurationBase() {}
|
||||
virtual void set_Field(Field& U) =0;
|
||||
virtual void smeared_force(Field&) = 0;
|
||||
virtual Field& get_SmearedU() =0;
|
||||
virtual Field &get_U(bool smeared = false) = 0;
|
||||
};
|
||||
|
||||
template <class GaugeField >
|
||||
class Action
|
||||
{
|
||||
|
||||
public:
|
||||
bool is_smeared = false;
|
||||
RealD deriv_norm_sum;
|
||||
@ -91,39 +77,11 @@ public:
|
||||
void refresh_timer_stop(void) { refresh_us+=usecond(); }
|
||||
void S_timer_start(void) { S_us-=usecond(); }
|
||||
void S_timer_stop(void) { S_us+=usecond(); }
|
||||
/////////////////////////////
|
||||
// Heatbath?
|
||||
/////////////////////////////
|
||||
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
|
||||
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
|
||||
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
|
||||
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// virtual smeared interface through configuration container
|
||||
/////////////////////////////////////////////////////////////
|
||||
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
|
||||
{
|
||||
refresh(U.get_U(is_smeared),sRNG,pRNG);
|
||||
}
|
||||
virtual RealD S(ConfigurationBase<GaugeField>& U)
|
||||
{
|
||||
return S(U.get_U(is_smeared));
|
||||
}
|
||||
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
|
||||
{
|
||||
return Sinitial(U.get_U(is_smeared));
|
||||
}
|
||||
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
|
||||
{
|
||||
deriv(U.get_U(is_smeared),dSdU);
|
||||
if ( is_smeared ) {
|
||||
U.smeared_force(dSdU);
|
||||
}
|
||||
}
|
||||
///////////////////////////////
|
||||
// Logging
|
||||
///////////////////////////////
|
||||
virtual std::string action_name() = 0; // return the action name
|
||||
virtual std::string LogParameters() = 0; // prints action parameters
|
||||
virtual ~Action(){}
|
||||
|
@ -30,8 +30,6 @@ directory
|
||||
#ifndef QCD_ACTION_CORE
|
||||
#define QCD_ACTION_CORE
|
||||
|
||||
#include <Grid/qcd/action/gauge/GaugeImplementations.h>
|
||||
|
||||
#include <Grid/qcd/action/ActionBase.h>
|
||||
NAMESPACE_CHECK(ActionBase);
|
||||
#include <Grid/qcd/action/ActionSet.h>
|
||||
|
@ -196,7 +196,6 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
|
||||
|
||||
uint64_t Nsite = Umu.Grid()->oSites();
|
||||
Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
|
||||
|
||||
};
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
|
||||
@ -247,10 +246,14 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
||||
|
||||
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
|
||||
|
||||
std::cout << " InsertForce Btilde "<< norm2(Btilde)<<std::endl;
|
||||
|
||||
////////////////////////////
|
||||
// spin trace outer product
|
||||
////////////////////////////
|
||||
Impl::InsertForce5D(mat, Btilde, Atilde, mu);
|
||||
|
||||
std::cout << " InsertForce "<< norm2(mat)<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -423,6 +423,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
|
||||
|
||||
#define KERNEL_CALL_EXT(A) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
const uint64_t sz = st.surface_list.size(); \
|
||||
auto ptr = &st.surface_list[0]; \
|
||||
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
|
||||
|
@ -176,7 +176,7 @@ public:
|
||||
return PeriodicBC::CshiftLink(Link,mu,shift);
|
||||
}
|
||||
|
||||
static inline void setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
|
||||
static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
|
||||
static inline std::vector<int> getDirections(void) { return _conjDirs; }
|
||||
static inline bool isPeriodicGaugeField(void) { return false; }
|
||||
};
|
||||
|
@ -43,7 +43,7 @@ public:
|
||||
private:
|
||||
RealD c_plaq;
|
||||
RealD c_rect;
|
||||
typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
|
||||
|
||||
public:
|
||||
PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};
|
||||
|
||||
@ -79,18 +79,27 @@ public:
|
||||
GridBase *grid = Umu.Grid();
|
||||
|
||||
std::vector<GaugeLinkField> U (Nd,grid);
|
||||
std::vector<GaugeLinkField> U2(Nd,grid);
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
|
||||
}
|
||||
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
|
||||
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
|
||||
|
||||
GaugeLinkField dSdU_mu(grid);
|
||||
GaugeLinkField staple(grid);
|
||||
|
||||
for (int mu=0; mu < Nd; mu++){
|
||||
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
|
||||
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
|
||||
|
||||
// Staple in direction mu
|
||||
|
||||
WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
|
||||
|
||||
dSdU_mu = Ta(U[mu]*staple)*factor_p;
|
||||
|
||||
WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
|
||||
|
||||
dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
|
||||
|
@ -119,13 +119,19 @@ public:
|
||||
// X^dag Der_oe MeeInv Meo Y
|
||||
// Use Mooee as nontrivial but gauge field indept
|
||||
this->_Mat.MeooeDag (V,tmp1); // odd->even -- implicit -0.5 factor to be applied
|
||||
std::cout << " tmp 1" << norm2(tmp1)<<std::endl;
|
||||
this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even
|
||||
std::cout << " tmp 1" << norm2(tmp2)<<std::endl;
|
||||
this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerYes);
|
||||
std::cout << " ForceO " << norm2(ForceO)<<std::endl;
|
||||
|
||||
// Accumulate X^dag M_oe MeeInv Der_eo Y
|
||||
this->_Mat.Meooe (U,tmp1); // even->odd -- implicit -0.5 factor to be applied
|
||||
std::cout << " tmp 1" << norm2(tmp1)<<std::endl;
|
||||
this->_Mat.MooeeInv(tmp1,tmp2); // even->even
|
||||
std::cout << " tmp 2" << norm2(tmp2)<<std::endl;
|
||||
this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
|
||||
std::cout << " ForceE " << norm2(ForceE)<<std::endl;
|
||||
|
||||
assert(ForceE.Checkerboard()==Even);
|
||||
assert(ForceO.Checkerboard()==Odd);
|
||||
|
@ -283,13 +283,12 @@ public:
|
||||
std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
|
||||
|
||||
TheIntegrator.print_timer();
|
||||
|
||||
TheIntegrator.Smearer.set_Field(Ucur);
|
||||
|
||||
for (int obs = 0; obs < Observables.size(); obs++) {
|
||||
std::cout << GridLogDebug << "Observables # " << obs << std::endl;
|
||||
std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl;
|
||||
std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
|
||||
Observables[obs]->TrajectoryComplete(traj + 1, TheIntegrator.Smearer, sRNG, pRNG);
|
||||
Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
|
||||
}
|
||||
std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
|
||||
}
|
||||
|
@ -35,16 +35,13 @@ class CheckpointerParameters : Serializable {
|
||||
public:
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters,
|
||||
std::string, config_prefix,
|
||||
std::string, smeared_prefix,
|
||||
std::string, rng_prefix,
|
||||
int, saveInterval,
|
||||
bool, saveSmeared,
|
||||
std::string, format, );
|
||||
|
||||
CheckpointerParameters(std::string cf = "cfg", std::string sf="cfg_smr" , std::string rn = "rng",
|
||||
CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng",
|
||||
int savemodulo = 1, const std::string &f = "IEEE64BIG")
|
||||
: config_prefix(cf),
|
||||
smeared_prefix(sf),
|
||||
rng_prefix(rn),
|
||||
saveInterval(savemodulo),
|
||||
format(f){};
|
||||
@ -64,21 +61,13 @@ template <class Impl>
|
||||
class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
|
||||
public:
|
||||
void build_filenames(int traj, CheckpointerParameters &Params,
|
||||
std::string &conf_file,
|
||||
std::string &smear_file,
|
||||
std::string &rng_file) {
|
||||
std::string &conf_file, std::string &rng_file) {
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << Params.rng_prefix << "." << traj;
|
||||
rng_file = os.str();
|
||||
}
|
||||
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << Params.smeared_prefix << "." << traj;
|
||||
smear_file = os.str();
|
||||
}
|
||||
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << Params.config_prefix << "." << traj;
|
||||
@ -95,11 +84,6 @@ public:
|
||||
}
|
||||
virtual void initialize(const CheckpointerParameters &Params) = 0;
|
||||
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
typename Impl::Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) { assert(0); } ; // HMC should pass the smart config with smeared and unsmeared
|
||||
|
||||
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) = 0;
|
||||
|
@ -61,14 +61,11 @@ public:
|
||||
fout.close();
|
||||
}
|
||||
|
||||
void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<Field> &SmartConfig,
|
||||
GridSerialRNG &sRNG, GridParallelRNG &pRNG)
|
||||
{
|
||||
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
|
||||
uint32_t nersc_csum;
|
||||
uint32_t scidac_csuma;
|
||||
@ -77,15 +74,9 @@ public:
|
||||
BinarySimpleUnmunger<sobj_double, sobj> munge;
|
||||
truncate(rng);
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
std::cout << GridLogMessage << "Written Binary RNG " << rng
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum <<"/"
|
||||
<< scidac_csuma <<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
truncate(config);
|
||||
BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(false), config, munge, 0, Params.format,
|
||||
|
||||
BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
|
||||
nersc_csum,scidac_csuma,scidac_csumb);
|
||||
|
||||
std::cout << GridLogMessage << "Written Binary Configuration " << config
|
||||
@ -94,18 +85,6 @@ public:
|
||||
<< scidac_csuma <<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
if ( Params.saveSmeared ) {
|
||||
truncate(smr);
|
||||
BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(true), smr, munge, 0, Params.format,
|
||||
nersc_csum,scidac_csuma,scidac_csumb);
|
||||
std::cout << GridLogMessage << "Written Binary Smeared Configuration " << smr
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum <<"/"
|
||||
<< scidac_csuma <<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -69,27 +69,17 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<GaugeField> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng, smr;
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
GridBase *grid = SmartConfig.get_U(false).Grid();
|
||||
GridBase *grid = U.Grid();
|
||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
std::cout << GridLogMessage << "Written BINARY RNG " << rng
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum<<"/"
|
||||
<< scidac_csuma<<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
|
||||
IldgWriter _IldgWriter(grid->IsBoss());
|
||||
_IldgWriter.open(config);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(false), traj, config, config);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
|
||||
_IldgWriter.close();
|
||||
|
||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
|
||||
@ -98,21 +88,6 @@ public:
|
||||
<< scidac_csuma<<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
if ( Params.saveSmeared ) {
|
||||
IldgWriter _IldgWriter(grid->IsBoss());
|
||||
_IldgWriter.open(smr);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
|
||||
_IldgWriter.close();
|
||||
|
||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum<<"/"
|
||||
<< scidac_csuma<<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -52,29 +52,23 @@ public:
|
||||
Params.format = "IEEE64BIG"; // fixed, overwrite any other choice
|
||||
}
|
||||
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<GaugeField> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG)
|
||||
{
|
||||
void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
|
||||
int precision32 = 1;
|
||||
int tworow = 0;
|
||||
NerscIO::writeRNGState(sRNG, pRNG, rng);
|
||||
NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(false), config, tworow, precision32);
|
||||
if ( Params.saveSmeared ) {
|
||||
NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(true), smr, tworow, precision32);
|
||||
}
|
||||
NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
|
||||
}
|
||||
};
|
||||
|
||||
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng );
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
@ -70,37 +70,19 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
||||
}
|
||||
}
|
||||
|
||||
void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<Field> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng,smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
GridBase *grid = SmartConfig.get_U(false).Grid();
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
GridBase *grid = U.Grid();
|
||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
std::cout << GridLogMessage << "Written Binary RNG " << rng
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum <<"/"
|
||||
<< scidac_csuma <<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
ScidacWriter _ScidacWriter(grid->IsBoss());
|
||||
_ScidacWriter.open(config);
|
||||
_ScidacWriter.writeScidacFieldRecord(U, MData);
|
||||
_ScidacWriter.close();
|
||||
|
||||
|
||||
{
|
||||
ScidacWriter _ScidacWriter(grid->IsBoss());
|
||||
_ScidacWriter.open(config);
|
||||
_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(false), MData);
|
||||
_ScidacWriter.close();
|
||||
}
|
||||
|
||||
if ( Params.saveSmeared ) {
|
||||
ScidacWriter _ScidacWriter(grid->IsBoss());
|
||||
_ScidacWriter.open(smr);
|
||||
_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(true), MData);
|
||||
_ScidacWriter.close();
|
||||
}
|
||||
std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl;
|
||||
}
|
||||
};
|
||||
|
@ -66,7 +66,6 @@ public:
|
||||
template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy>
|
||||
class Integrator {
|
||||
protected:
|
||||
public:
|
||||
typedef FieldImplementation_ FieldImplementation;
|
||||
typedef typename FieldImplementation::Field MomentaField; //for readability
|
||||
typedef typename FieldImplementation::Field Field;
|
||||
@ -97,6 +96,7 @@ public:
|
||||
{
|
||||
t_P[level] += ep;
|
||||
update_P(P, U, level, ep);
|
||||
|
||||
std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
|
||||
}
|
||||
|
||||
@ -130,20 +130,28 @@ public:
|
||||
Field force(U.Grid());
|
||||
conformable(U.Grid(), Mom.Grid());
|
||||
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
|
||||
double start_force = usecond();
|
||||
|
||||
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
|
||||
|
||||
as[level].actions.at(a)->deriv_timer_start();
|
||||
as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta
|
||||
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
|
||||
as[level].actions.at(a)->deriv_timer_stop();
|
||||
|
||||
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
|
||||
|
||||
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
|
||||
auto name = as[level].actions.at(a)->action_name();
|
||||
if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
|
||||
|
||||
force = FieldImplementation::projectForce(force); // Ta for gauge fields
|
||||
double end_force = usecond();
|
||||
|
||||
MomFilter->applyFilter(force);
|
||||
|
||||
// DumpSliceNorm("force ",force,Nd-1);
|
||||
MomFilter->applyFilter(force);
|
||||
std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<< std::endl;
|
||||
DumpSliceNorm("force filtered ",force,Nd-1);
|
||||
|
||||
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
|
||||
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
|
||||
@ -369,9 +377,14 @@ public:
|
||||
auto name = as[level].actions.at(actionID)->action_name();
|
||||
std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
|
||||
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
|
||||
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
|
||||
|
||||
as[level].actions.at(actionID)->refresh_timer_start();
|
||||
as[level].actions.at(actionID)->refresh(Smearer, sRNG, pRNG);
|
||||
as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
|
||||
as[level].actions.at(actionID)->refresh_timer_stop();
|
||||
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@ -412,9 +425,10 @@ public:
|
||||
|
||||
// get gauge field from the SmearingPolicy and
|
||||
// based on the boolean is_smeared in actionID
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
|
||||
as[level].actions.at(actionID)->S_timer_start();
|
||||
Hterm = as[level].actions.at(actionID)->S(Smearer);
|
||||
Hterm = as[level].actions.at(actionID)->S(Us);
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
@ -455,11 +469,12 @@ public:
|
||||
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
|
||||
// get gauge field from the SmearingPolicy and
|
||||
// based on the boolean is_smeared in actionID
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
|
||||
as[level].actions.at(actionID)->S_timer_start();
|
||||
|
||||
as[level].actions.at(actionID)->S_timer_start();
|
||||
Hterm = as[level].actions.at(actionID)->S(Smearer);
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
Hterm = as[level].actions.at(actionID)->Sinitial(Us);
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
|
@ -34,13 +34,6 @@ NAMESPACE_BEGIN(Grid);
|
||||
template <class Field>
|
||||
class HmcObservable {
|
||||
public:
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<Field> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG)
|
||||
{
|
||||
TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
|
||||
};
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
|
@ -42,18 +42,6 @@ public:
|
||||
// necessary for HmcObservable compatibility
|
||||
typedef typename Impl::Field Field;
|
||||
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<Field> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG)
|
||||
{
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
|
||||
std::cout << GridLogMessage << "Unsmeared plaquette"<<std::endl;
|
||||
TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
|
||||
std::cout << GridLogMessage << "Smeared plaquette"<<std::endl;
|
||||
TrajectoryComplete(traj,SmartConfig.get_U(true),sRNG,pRNG); // Unsmeared observable
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
|
||||
};
|
||||
void TrajectoryComplete(int traj,
|
||||
Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
|
@ -7,27 +7,26 @@
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
//trivial class for no smearing
|
||||
template< class Impl >
|
||||
class NoSmearing : public ConfigurationBase<typename Impl::Field>
|
||||
class NoSmearing
|
||||
{
|
||||
public:
|
||||
INHERIT_FIELD_TYPES(Impl);
|
||||
|
||||
Field* ThinLinks;
|
||||
Field* ThinField;
|
||||
|
||||
NoSmearing(): ThinLinks(NULL) {}
|
||||
NoSmearing(): ThinField(NULL) {}
|
||||
|
||||
virtual void set_Field(Field& U) { ThinLinks = &U; }
|
||||
void set_Field(Field& U) { ThinField = &U; }
|
||||
|
||||
virtual void smeared_force(Field&) {}
|
||||
void smeared_force(Field&) const {}
|
||||
|
||||
virtual Field& get_SmearedU() { return *ThinLinks; }
|
||||
Field& get_SmearedU() { return *ThinField; }
|
||||
|
||||
virtual Field &get_U(bool smeared = false)
|
||||
Field &get_U(bool smeared = false)
|
||||
{
|
||||
return *ThinLinks;
|
||||
return *ThinField;
|
||||
}
|
||||
};
|
||||
|
||||
@ -43,24 +42,19 @@ public:
|
||||
It stores a list of smeared configurations.
|
||||
*/
|
||||
template <class Gimpl>
|
||||
class SmearedConfiguration : public ConfigurationBase<typename Gimpl::Field>
|
||||
class SmearedConfiguration
|
||||
{
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
protected:
|
||||
private:
|
||||
const unsigned int smearingLevels;
|
||||
Smear_Stout<Gimpl> *StoutSmearing;
|
||||
std::vector<GaugeField> SmearedSet;
|
||||
public:
|
||||
GaugeField* ThinLinks; /* Pointer to the thin links configuration */ // move to base???
|
||||
protected:
|
||||
|
||||
|
||||
// Member functions
|
||||
//====================================================================
|
||||
|
||||
// Overridden in masked version
|
||||
virtual void fill_smearedSet(GaugeField &U)
|
||||
void fill_smearedSet(GaugeField &U)
|
||||
{
|
||||
ThinLinks = &U; // attach the smearing routine to the field U
|
||||
|
||||
@ -88,10 +82,9 @@ protected:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//overridden in masked verson
|
||||
virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
|
||||
const GaugeField& GaugeK) const
|
||||
//====================================================================
|
||||
GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
|
||||
const GaugeField& GaugeK) const
|
||||
{
|
||||
GridBase* grid = GaugeK.Grid();
|
||||
GaugeField C(grid), SigmaK(grid), iLambda(grid);
|
||||
@ -220,6 +213,8 @@ protected:
|
||||
|
||||
//====================================================================
|
||||
public:
|
||||
GaugeField*
|
||||
ThinLinks; /* Pointer to the thin links configuration */
|
||||
|
||||
/* Standard constructor */
|
||||
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
|
||||
@ -235,7 +230,7 @@ public:
|
||||
: smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
|
||||
|
||||
// attach the smeared routines to the thin links U and fill the smeared set
|
||||
virtual void set_Field(GaugeField &U)
|
||||
void set_Field(GaugeField &U)
|
||||
{
|
||||
double start = usecond();
|
||||
fill_smearedSet(U);
|
||||
@ -245,7 +240,7 @@ public:
|
||||
}
|
||||
|
||||
//====================================================================
|
||||
virtual void smeared_force(GaugeField &SigmaTilde)
|
||||
void smeared_force(GaugeField &SigmaTilde) const
|
||||
{
|
||||
if (smearingLevels > 0)
|
||||
{
|
||||
@ -272,16 +267,14 @@ public:
|
||||
}
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << " GaugeConfiguration: Smeared Force chain rule took " << time << " ms" << std::endl;
|
||||
std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;
|
||||
} // if smearingLevels = 0 do nothing
|
||||
SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
|
||||
|
||||
}
|
||||
//====================================================================
|
||||
|
||||
virtual GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
|
||||
GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
|
||||
|
||||
virtual GaugeField &get_U(bool smeared = false)
|
||||
GaugeField &get_U(bool smeared = false)
|
||||
{
|
||||
// get the config, thin links by default
|
||||
if (smeared)
|
||||
|
@ -1,813 +0,0 @@
|
||||
/*!
|
||||
@file GaugeConfiguration.h
|
||||
@brief Declares the GaugeConfiguration class
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*!
|
||||
@brief Smeared configuration masked container
|
||||
Modified for a multi-subset smearing (aka Luscher Flowed HMC)
|
||||
*/
|
||||
template <class Gimpl>
|
||||
class SmearedConfigurationMasked : public SmearedConfiguration<Gimpl>
|
||||
{
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
private:
|
||||
// These live in base class
|
||||
// const unsigned int smearingLevels;
|
||||
// Smear_Stout<Gimpl> *StoutSmearing;
|
||||
// std::vector<GaugeField> SmearedSet;
|
||||
|
||||
std::vector<LatticeLorentzComplex> masks;
|
||||
|
||||
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
||||
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
||||
typedef typename SU3Adjoint::LatticeAdjVector AdjVectorField;
|
||||
|
||||
// Adjoint vector to GaugeField force
|
||||
void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
|
||||
{
|
||||
Complex ci(0,1);
|
||||
GaugeLinkField Fdet_pol(Fdet.Grid());
|
||||
Fdet_pol=Zero();
|
||||
for(int e=0;e<8;e++){
|
||||
ColourMatrix te;
|
||||
SU3::generator(e, te);
|
||||
auto tmp=peekColour(Fdet_nu,e);
|
||||
Fdet_pol=Fdet_pol + ci*tmp*te; // but norm of te is different.. why?
|
||||
}
|
||||
pokeLorentz(Fdet, Fdet_pol, nu);
|
||||
}
|
||||
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
|
||||
{
|
||||
GaugeLinkField UtaU(PlaqL.Grid());
|
||||
GaugeLinkField D(PlaqL.Grid());
|
||||
AdjMatrixField Dbc(PlaqL.Grid());
|
||||
LatticeComplex tmp(PlaqL.Grid());
|
||||
const int Ngen = SU3Adjoint::Dimension;
|
||||
Complex ci(0,1);
|
||||
ColourMatrix ta,tb,tc;
|
||||
|
||||
for(int a=0;a<Ngen;a++) {
|
||||
SU3::generator(a, ta);
|
||||
// Qlat Tb = 2i Tb^Grid
|
||||
UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
|
||||
for(int c=0;c<Ngen;c++) {
|
||||
SU3::generator(c, tc);
|
||||
D = Ta( (2.0)*ci*tc *UtaU);
|
||||
for(int b=0;b<Ngen;b++){
|
||||
SU3::generator(b, tb);
|
||||
tmp =-trace(ci*tb*D);
|
||||
PokeIndex<ColourIndex>(Dbc,tmp,b,c); // Adjoint rep
|
||||
}
|
||||
}
|
||||
tmp = trace(MpInvJx * Dbc);
|
||||
PokeIndex<ColourIndex>(Fdet2,tmp,a);
|
||||
}
|
||||
}
|
||||
|
||||
void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
|
||||
{
|
||||
GaugeLinkField Nx(PlaqL.Grid());
|
||||
const int Ngen = SU3Adjoint::Dimension;
|
||||
Complex ci(0,1);
|
||||
ColourMatrix tb;
|
||||
ColourMatrix tc;
|
||||
for(int b=0;b<Ngen;b++) {
|
||||
SU3::generator(b, tb);
|
||||
Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR );
|
||||
for(int c=0;c<Ngen;c++) {
|
||||
SU3::generator(c, tc);
|
||||
auto tmp =closure( -trace(ci*tc*Nx));
|
||||
PokeIndex<ColourIndex>(NxAd,tmp,c,b);
|
||||
}
|
||||
}
|
||||
}
|
||||
void ApplyMask(GaugeField &U,int smr)
|
||||
{
|
||||
LatticeComplex tmp(U.Grid());
|
||||
GaugeLinkField Umu(U.Grid());
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
Umu=PeekIndex<LorentzIndex>(U,mu);
|
||||
tmp=PeekIndex<LorentzIndex>(masks[smr],mu);
|
||||
Umu=Umu*tmp;
|
||||
PokeIndex<LorentzIndex>(U, Umu, mu);
|
||||
}
|
||||
}
|
||||
public:
|
||||
|
||||
void logDetJacobianForceLevel(const GaugeField &U, GaugeField &force ,int smr)
|
||||
{
|
||||
GridBase* grid = U.Grid();
|
||||
ColourMatrix tb;
|
||||
ColourMatrix tc;
|
||||
ColourMatrix ta;
|
||||
GaugeField C(grid);
|
||||
GaugeField Umsk(grid);
|
||||
std::vector<GaugeLinkField> Umu(Nd,grid);
|
||||
GaugeLinkField Cmu(grid); // U and staple; C contains factor of epsilon
|
||||
GaugeLinkField Zx(grid); // U times Staple, contains factor of epsilon
|
||||
GaugeLinkField Nxx(grid); // Nxx fundamental space
|
||||
GaugeLinkField Utmp(grid);
|
||||
GaugeLinkField PlaqL(grid);
|
||||
GaugeLinkField PlaqR(grid);
|
||||
const int Ngen = SU3Adjoint::Dimension;
|
||||
AdjMatrix TRb;
|
||||
ColourMatrix Ident;
|
||||
LatticeComplex cplx(grid);
|
||||
|
||||
AdjVectorField dJdXe_nMpInv(grid);
|
||||
AdjVectorField dJdXe_nMpInv_y(grid);
|
||||
AdjMatrixField MpAd(grid); // Mprime luchang's notes
|
||||
AdjMatrixField MpAdInv(grid); // Mprime inverse
|
||||
AdjMatrixField NxxAd(grid); // Nxx in adjoint space
|
||||
AdjMatrixField JxAd(grid);
|
||||
AdjMatrixField ZxAd(grid);
|
||||
AdjMatrixField mZxAd(grid);
|
||||
AdjMatrixField X(grid);
|
||||
Complex ci(0,1);
|
||||
|
||||
RealD t0 = usecond();
|
||||
Ident = ComplexD(1.0);
|
||||
for(int d=0;d<Nd;d++){
|
||||
Umu[d] = peekLorentz(U, d);
|
||||
}
|
||||
int mu= (smr/2) %Nd;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Mask the gauge field
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
||||
|
||||
Umsk = U;
|
||||
ApplyMask(Umsk,smr);
|
||||
Utmp = peekLorentz(Umsk,mu);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Retrieve the eps/rho parameter(s) -- could allow all different but not so far
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
double rho=this->StoutSmearing->SmearRho[1];
|
||||
int idx=0;
|
||||
for(int mu=0;mu<4;mu++){
|
||||
for(int nu=0;nu<4;nu++){
|
||||
if ( mu!=nu) assert(this->StoutSmearing->SmearRho[idx]==rho);
|
||||
else assert(this->StoutSmearing->SmearRho[idx]==0.0);
|
||||
idx++;
|
||||
}}
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Assemble the N matrix
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Computes ALL the staples -- could compute one only and do it here
|
||||
RealD time;
|
||||
time=-usecond();
|
||||
this->StoutSmearing->BaseSmear(C, U);
|
||||
Cmu = peekLorentz(C, mu);
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Assemble Luscher exp diff map J matrix
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Ta so Z lives in Lie algabra
|
||||
Zx = Ta(Cmu * adj(Umu[mu]));
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "Z took "<<time<< " us"<<std::endl;
|
||||
|
||||
time=-usecond();
|
||||
// Move Z to the Adjoint Rep == make_adjoint_representation
|
||||
ZxAd = Zero();
|
||||
for(int b=0;b<8;b++) {
|
||||
// Adj group sets traceless antihermitian T's -- Guido, really????
|
||||
SU3::generator(b, tb); // Fund group sets traceless hermitian T's
|
||||
SU3Adjoint::generator(b,TRb);
|
||||
TRb=-TRb;
|
||||
cplx = 2.0*trace(ci*tb*Zx); // my convention 1/2 delta ba
|
||||
ZxAd = ZxAd + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
|
||||
}
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "ZxAd took "<<time<< " us"<<std::endl;
|
||||
|
||||
//////////////////////////////////////
|
||||
// J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
|
||||
//////////////////////////////////////
|
||||
time=-usecond();
|
||||
X=1.0;
|
||||
JxAd = X;
|
||||
mZxAd = (-1.0)*ZxAd;
|
||||
RealD kpfac = 1;
|
||||
for(int k=1;k<12;k++){
|
||||
X=X*mZxAd;
|
||||
kpfac = kpfac /(k+1);
|
||||
JxAd = JxAd + X * kpfac;
|
||||
}
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "Jx took "<<time<< " us"<<std::endl;
|
||||
|
||||
//////////////////////////////////////
|
||||
// dJ(x)/dxe
|
||||
//////////////////////////////////////
|
||||
time=-usecond();
|
||||
std::vector<AdjMatrixField> dJdX; dJdX.resize(8,grid);
|
||||
AdjMatrixField tbXn(grid);
|
||||
AdjMatrixField sumXtbX(grid);
|
||||
AdjMatrixField t2(grid);
|
||||
AdjMatrixField dt2(grid);
|
||||
AdjMatrixField t3(grid);
|
||||
AdjMatrixField dt3(grid);
|
||||
AdjMatrixField aunit(grid);
|
||||
for(int b=0;b<8;b++){
|
||||
aunit = ComplexD(1.0);
|
||||
SU3Adjoint::generator(b, TRb); //dt2
|
||||
|
||||
X = (-1.0)*ZxAd;
|
||||
t2 = X;
|
||||
dt2 = TRb;
|
||||
for (int j = 20; j > 1; --j) {
|
||||
t3 = t2*(1.0 / (j + 1)) + aunit;
|
||||
dt3 = dt2*(1.0 / (j + 1));
|
||||
t2 = X * t3;
|
||||
dt2 = TRb * t3 + X * dt3;
|
||||
}
|
||||
dJdX[b] = -dt2;
|
||||
}
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
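
// dJdX[b] is built with a Horner-style backward recursion: each pass of the j loop updates the
// truncated series (t2,t3) and, by the product rule, its variation (dt2,dt3) along the adjoint
// generator TRb; the overall sign is restored at the end by dJdX[b] = -dt2.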
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Mask Umu for this link
|
||||
/////////////////////////////////////////////////////////////////
|
||||
time=-usecond();
|
||||
PlaqL = Ident;
|
||||
PlaqR = Utmp*adj(Cmu);
|
||||
ComputeNxy(PlaqL,PlaqR,NxxAd);
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "ComputeNxy took "<<time<< " us"<<std::endl;
|
||||
|
||||
////////////////////////////
|
||||
// Mab
|
||||
////////////////////////////
|
||||
MpAd = Complex(1.0,0.0);
|
||||
MpAd = MpAd - JxAd * NxxAd;
|
||||
|
||||
/////////////////////////
|
||||
// invert the 8x8
|
||||
/////////////////////////
|
||||
time=-usecond();
|
||||
MpAdInv = Inverse(MpAd);
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "MpAdInv took "<<time<< " us"<<std::endl;
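
// Inverse() here is the per-site, Eigen-based inversion of the 8x8 adjoint-index matrix
// (defined with the SU(N) utilities elsewhere in this commit).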
|
||||
|
||||
RealD t3a = usecond();
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Nxx Mp^-1
|
||||
/////////////////////////////////////////////////////////////////
|
||||
AdjVectorField FdetV(grid);
|
||||
AdjVectorField Fdet1_nu(grid);
|
||||
AdjVectorField Fdet2_nu(grid);
|
||||
AdjVectorField Fdet2_mu(grid);
|
||||
AdjVectorField Fdet1_mu(grid);
|
||||
|
||||
AdjMatrixField nMpInv(grid);
|
||||
nMpInv= NxxAd *MpAdInv;
|
||||
|
||||
AdjMatrixField MpInvJx(grid);
|
||||
AdjMatrixField MpInvJx_nu(grid);
|
||||
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
||||
Fdet2_mu=FdetV;
|
||||
Fdet1_mu=Zero();
|
||||
|
||||
for(int e =0 ; e<8 ; e++){
|
||||
LatticeComplexD tr(grid);
|
||||
ColourMatrix te;
|
||||
SU3::generator(e, te);
|
||||
tr = trace(dJdX[e] * nMpInv);
|
||||
pokeColour(dJdXe_nMpInv,tr,e);
|
||||
}
|
||||
///////////////////////////////
|
||||
// Mask it off
|
||||
///////////////////////////////
|
||||
auto tmp=PeekIndex<LorentzIndex>(masks[smr],mu);
|
||||
dJdXe_nMpInv = dJdXe_nMpInv*tmp;
|
||||
|
||||
// dJdXe_nMpInv needs to multiply:
|
||||
// Nxx_mu (site local) (1)
|
||||
// Nxy_mu one site forward in each nu direction (3)
|
||||
// Nxy_mu one site backward in each nu direction (3)
|
||||
// Nxy_nu 0,0 ; +mu,0; 0,-nu; +mu-nu [ 3x4 = 12]
|
||||
// 19 terms.
|
||||
AdjMatrixField Nxy(grid);
|
||||
|
||||
GaugeField Fdet1(grid);
|
||||
GaugeField Fdet2(grid);
|
||||
GaugeLinkField Fdet_pol(grid); // one polarisation
|
||||
|
||||
RealD t4 = usecond();
|
||||
for(int nu=0;nu<Nd;nu++){
|
||||
|
||||
if (nu!=mu) {
|
||||
///////////////// +ve nu /////////////////
|
||||
// __
|
||||
// | |
|
||||
// x== // nu polarisation -- clockwise
|
||||
|
||||
time=-usecond();
|
||||
PlaqL=Ident;
|
||||
|
||||
PlaqR=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftForward(Umu[mu], mu,
|
||||
Gimpl::CovShiftBackward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu))));
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "PlaqLR took "<<time<< " us"<<std::endl;
|
||||
|
||||
time=-usecond();
|
||||
dJdXe_nMpInv_y = dJdXe_nMpInv;
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_nu = transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "ComputeNxy (occurs 6x) took "<<time<< " us"<<std::endl;
|
||||
|
||||
time=-usecond();
|
||||
PlaqR=(-1.0)*PlaqR;
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
||||
Fdet2_nu = FdetV;
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
|
||||
|
||||
// x==
|
||||
// | |
|
||||
// .__| // nu polarisation -- anticlockwise
|
||||
|
||||
PlaqR=(rho)*Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftBackward(Umu[mu], mu,
|
||||
Gimpl::CovShiftIdentityBackward(Umu[nu], nu)));
|
||||
|
||||
PlaqL=Gimpl::CovShiftIdentityBackward(Utmp, mu);
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
|
||||
ComputeNxy(PlaqL, PlaqR,Nxy);
|
||||
Fdet1_nu = Fdet1_nu+transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
///////////////// -ve nu /////////////////
|
||||
// __
|
||||
// | |
|
||||
// x== // nu polarisation -- clockwise
|
||||
|
||||
PlaqL=(rho)* Gimpl::CovShiftForward(Umu[mu], mu,
|
||||
Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
|
||||
|
||||
PlaqR = Gimpl::CovShiftIdentityForward(Umu[nu], nu);
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
// x==
|
||||
// | |
|
||||
// |__| // nu polarisation
|
||||
|
||||
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu));
|
||||
|
||||
PlaqR=Gimpl::CovShiftBackward(Umu[mu], mu,
|
||||
Gimpl::CovShiftIdentityForward(Umu[nu], nu));
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv_y,nu,1);
|
||||
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Set up the determinant force contribution in 3x3 algebra basis
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
InsertForce(Fdet1,Fdet1_nu,nu);
|
||||
InsertForce(Fdet2,Fdet2_nu,nu);
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// Parallel direction terms
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// __
|
||||
// | "
|
||||
// |__"x // mu polarisation
|
||||
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
|
||||
Gimpl::CovShiftBackward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
|
||||
|
||||
PlaqR=Gimpl::CovShiftIdentityBackward(Umu[nu], nu);
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,-1);
|
||||
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_mu = Fdet2_mu+FdetV;
|
||||
|
||||
// __
|
||||
// " |
|
||||
// x__| // mu polarisation
|
||||
|
||||
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
|
||||
Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
|
||||
|
||||
PlaqR=Gimpl::CovShiftIdentityForward(Umu[nu], nu);
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
|
||||
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_mu = Fdet2_mu+FdetV;
|
||||
|
||||
}
|
||||
}
|
||||
RealD t5 = usecond();
|
||||
|
||||
Fdet1_mu = Fdet1_mu + transpose(NxxAd)*dJdXe_nMpInv;
|
||||
|
||||
InsertForce(Fdet1,Fdet1_mu,mu);
|
||||
InsertForce(Fdet2,Fdet2_mu,mu);
|
||||
|
||||
force= (-0.5)*( Fdet1 + Fdet2);
|
||||
RealD t1 = usecond();
|
||||
std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce t3-t0 "<<t3a-t0<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce t4-t3 dJdXe_nMpInv "<<t4-t3a<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce t5-t4 mu nu loop "<<t5-t4<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce t1-t5 "<<t1-t5<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
|
||||
}
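
// logDetJacobianForceLevel is invoked once per smearing level from logDetJacobianForce()
// below, which chains the per-level contributions through the smeared-force recursion.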
|
||||
RealD logDetJacobianLevel(const GaugeField &U,int smr)
|
||||
{
|
||||
GridBase* grid = U.Grid();
|
||||
GaugeField C(grid);
|
||||
GaugeLinkField Nb(grid);
|
||||
GaugeLinkField Z(grid);
|
||||
GaugeLinkField Umu(grid), Cmu(grid);
|
||||
ColourMatrix Tb;
|
||||
ColourMatrix Tc;
|
||||
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
||||
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
||||
typedef typename SU3Adjoint::LatticeAdjVector AdjVectorField;
|
||||
const int Ngen = SU3Adjoint::Dimension;
|
||||
AdjMatrix TRb;
|
||||
LatticeComplex cplx(grid);
|
||||
AdjVectorField AlgV(grid);
|
||||
AdjMatrixField Mab(grid);
|
||||
AdjMatrixField Ncb(grid);
|
||||
AdjMatrixField Jac(grid);
|
||||
AdjMatrixField Zac(grid);
|
||||
AdjMatrixField mZac(grid);
|
||||
AdjMatrixField X(grid);
|
||||
|
||||
int mu= (smr/2) %Nd;
|
||||
|
||||
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Assemble the N matrix
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Computes ALL the staples -- could compute one only here
|
||||
this->StoutSmearing->BaseSmear(C, U);
|
||||
Cmu = peekLorentz(C, mu);
|
||||
Umu = peekLorentz(U, mu);
|
||||
Complex ci(0,1);
|
||||
for(int b=0;b<Ngen;b++) {
|
||||
SU3::generator(b, Tb);
|
||||
// Qlat Tb = 2i Tb^Grid
|
||||
Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
|
||||
for(int c=0;c<Ngen;c++) {
|
||||
SU3::generator(c, Tc);
|
||||
auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
|
||||
PokeIndex<ColourIndex>(Ncb,tmp,c,b);
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////
// Assemble Luscher exp diff map J matrix
//////////////////////////////////////////////////////////////////
// Ta so Z lives in the Lie algebra
Z = Ta(Cmu * adj(Umu));

// Move Z to the adjoint representation == make_adjoint_representation
Zac = Zero();
for(int b=0;b<8;b++) {
  // The adjoint representation uses traceless anti-hermitian generators with the same
  // structure constants as the fundamental, whose SU3::generator returns traceless
  // hermitian T's; hence the sign flip on TRb below.
  SU3::generator(b, Tb);        // Fund rep: traceless hermitian T's
  SU3Adjoint::generator(b,TRb);
  TRb=-TRb;
  cplx = 2.0*trace(ci*Tb*Z);    // normalisation: tr(T_a T_b) = 1/2 delta_ab
  Zac = Zac + cplx * TRb;
}
|
||||
|
||||
//////////////////////////////////////
|
||||
// J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
|
||||
//////////////////////////////////////
|
||||
X=1.0;
|
||||
Jac = X;
|
||||
mZac = (-1.0)*Zac;
|
||||
RealD kpfac = 1;
|
||||
for(int k=1;k<12;k++){
|
||||
X=X*mZac;
|
||||
kpfac = kpfac /(k+1);
|
||||
Jac = Jac + X * kpfac;
|
||||
}
|
||||
|
||||
////////////////////////////
|
||||
// Mab
|
||||
////////////////////////////
|
||||
Mab = Complex(1.0,0.0);
|
||||
Mab = Mab - Jac * Ncb;
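// i.e. M_ab = delta_ab - J_ac N_cb ; its log-determinant is summed over the masked sites below.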
|
||||
|
||||
////////////////////////////
|
||||
// det
|
||||
////////////////////////////
|
||||
LatticeComplex det(grid);
|
||||
det = Determinant(Mab);
|
||||
|
||||
////////////////////////////
|
||||
// ln det
|
||||
////////////////////////////
|
||||
LatticeComplex ln_det(grid);
|
||||
ln_det = log(det);
|
||||
|
||||
////////////////////////////
|
||||
// Masked sum
|
||||
////////////////////////////
|
||||
ln_det = ln_det * mask;
|
||||
Complex result = sum(ln_det);
|
||||
return result.real();
|
||||
}
|
||||
public:
|
||||
RealD logDetJacobian(void)
|
||||
{
|
||||
RealD ln_det = 0;
|
||||
if (this->smearingLevels > 0)
|
||||
{
|
||||
double start = usecond();
|
||||
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
|
||||
ln_det+= logDetJacobianLevel(this->get_smeared_conf(ismr-1),ismr);
|
||||
}
|
||||
ln_det +=logDetJacobianLevel(*(this->ThinLinks),0);
|
||||
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << "GaugeConfigurationMasked: logDetJacobian took " << time << " ms" << std::endl;
|
||||
}
|
||||
return ln_det;
|
||||
}
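
// Usage sketch (illustrative; names assumed): the Jacobian enters the HMC action with a
// minus sign, mirroring JacobianAction::S,
//   RealD S_det = -smearer.logDetJacobian();
// where "smearer" is a SmearedConfigurationMasked<Gimpl> instance.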
|
||||
void logDetJacobianForce(GaugeField &force)
|
||||
{
|
||||
force =Zero();
|
||||
GaugeField force_det(force.Grid());
|
||||
|
||||
if (this->smearingLevels > 0)
|
||||
{
|
||||
double start = usecond();
|
||||
|
||||
GaugeLinkField tmp_mu(force.Grid());
|
||||
|
||||
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
|
||||
|
||||
// remove U in UdSdU...
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
tmp_mu = adj(peekLorentz(this->get_smeared_conf(ismr), mu)) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
// Propagate existing force
|
||||
force = this->AnalyticSmearedForce(force, this->get_smeared_conf(ismr - 1), ismr);
|
||||
|
||||
// Add back U in UdSdU...
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
tmp_mu = peekLorentz(this->get_smeared_conf(ismr - 1), mu) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
// Get this levels determinant force
|
||||
force_det = Zero();
|
||||
logDetJacobianForceLevel(this->get_smeared_conf(ismr-1),force_det,ismr);
|
||||
|
||||
// Sum the contributions
|
||||
force = force + force_det;
|
||||
}
|
||||
|
||||
// remove U in UdSdU...
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
tmp_mu = adj(peekLorentz(this->get_smeared_conf(0), mu)) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
force = this->AnalyticSmearedForce(force, *this->ThinLinks,0);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
tmp_mu = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
force_det = Zero();
|
||||
|
||||
logDetJacobianForceLevel(*this->ThinLinks,force_det,0);
|
||||
|
||||
force = force + force_det;
|
||||
|
||||
force=Ta(force); // Ta
|
||||
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << "GaugeConfigurationMasked: lnDetJacobianForce took " << time << " ms" << std::endl;
|
||||
} // if smearingLevels = 0 do nothing
|
||||
}
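
// The loop above applies the chain rule level by level: strip the U factor from U dS/dU, push
// the force down one level with AnalyticSmearedForce, restore the U factor, then add that
// level's determinant force; the final Ta() projects the result back onto the algebra.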
|
||||
|
||||
private:
|
||||
//====================================================================
// Override base class here to mask it
virtual void fill_smearedSet(GaugeField &U)
|
||||
{
|
||||
this->ThinLinks = &U; // attach the smearing routine to the field U
|
||||
|
||||
// check the pointer is not null
|
||||
if (this->ThinLinks == NULL)
|
||||
std::cout << GridLogError << "[SmearedConfigurationMasked] Error in ThinLinks pointer\n";
|
||||
|
||||
if (this->smearingLevels > 0)
|
||||
{
|
||||
std::cout << GridLogMessage << "[SmearedConfigurationMasked] Filling SmearedSet\n";
|
||||
GaugeField previous_u(this->ThinLinks->Grid());
|
||||
|
||||
GaugeField smeared_A(this->ThinLinks->Grid());
|
||||
GaugeField smeared_B(this->ThinLinks->Grid());
|
||||
|
||||
previous_u = *this->ThinLinks;
|
||||
double start = usecond();
|
||||
for (int smearLvl = 0; smearLvl < this->smearingLevels; ++smearLvl)
|
||||
{
|
||||
this->StoutSmearing->smear(smeared_A, previous_u);
|
||||
ApplyMask(smeared_A,smearLvl);
|
||||
smeared_B = previous_u;
|
||||
ApplyMask(smeared_B,smearLvl);
|
||||
// Replace only the masked portion
|
||||
this->SmearedSet[smearLvl] = previous_u-smeared_B + smeared_A;
|
||||
previous_u = this->SmearedSet[smearLvl];
|
||||
|
||||
// For debug purposes
|
||||
RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(previous_u);
|
||||
std::cout << GridLogMessage << "[SmearedConfigurationMasked] smeared Plaq: " << impl_plaq << std::endl;
|
||||
}
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << "GaugeConfigurationMasked: Link smearing took " << time << " ms" << std::endl;
|
||||
}
|
||||
}
|
||||
//====================================================================
|
||||
// Override base to add masking
|
||||
virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
|
||||
const GaugeField& GaugeK,int level)
|
||||
{
|
||||
GridBase* grid = GaugeK.Grid();
|
||||
GaugeField C(grid), SigmaK(grid), iLambda(grid);
|
||||
GaugeField SigmaKPrimeA(grid);
|
||||
GaugeField SigmaKPrimeB(grid);
|
||||
GaugeLinkField iLambda_mu(grid);
|
||||
GaugeLinkField iQ(grid), e_iQ(grid);
|
||||
GaugeLinkField SigmaKPrime_mu(grid);
|
||||
GaugeLinkField GaugeKmu(grid), Cmu(grid);
|
||||
|
||||
this->StoutSmearing->BaseSmear(C, GaugeK);
|
||||
SigmaK = Zero();
|
||||
iLambda = Zero();
|
||||
|
||||
SigmaKPrimeA = SigmaKPrime;
|
||||
ApplyMask(SigmaKPrimeA,level);
|
||||
SigmaKPrimeB = SigmaKPrime - SigmaKPrimeA;
|
||||
|
||||
// Could get away with computing only one polarisation here
// int mu= (smr/2) %Nd;
// SigmaKPrimeA has only one component
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
Cmu = peekLorentz(C, mu);
|
||||
GaugeKmu = peekLorentz(GaugeK, mu);
|
||||
SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
|
||||
iQ = Ta(Cmu * adj(GaugeKmu));
|
||||
this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
|
||||
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
|
||||
pokeLorentz(iLambda, iLambda_mu, mu);
|
||||
}
|
||||
this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// propagate the rest of the force as identity map, just add back
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
SigmaK = SigmaK+SigmaKPrimeB;
|
||||
|
||||
return SigmaK;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/* Standard constructor */
|
||||
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
|
||||
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
|
||||
{
|
||||
assert(Nsmear%(2*Nd)==0); // Or multiply by 8??
|
||||
|
||||
// was resized in base class
|
||||
assert(this->SmearedSet.size()==Nsmear);
|
||||
|
||||
GridRedBlackCartesian * UrbGrid;
|
||||
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
|
||||
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
|
||||
LatticeComplex tmp(_UGrid);
|
||||
|
||||
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
|
||||
|
||||
masks.emplace_back(_UGrid); // construct the mask in place (avoids a leaked heap allocation)
|
||||
|
||||
int mu= (i/2) %Nd;
|
||||
int cb= (i%2);
|
||||
LatticeComplex tmpcb(UrbGrid);
|
||||
|
||||
masks[i]=Zero();
|
||||
////////////////////
|
||||
// Setup the mask
|
||||
////////////////////
|
||||
tmp = Zero();
|
||||
pickCheckerboard(cb,tmpcb,one);
|
||||
setCheckerboard(tmp,tmpcb);
|
||||
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
|
||||
|
||||
}
|
||||
delete UrbGrid;
|
||||
}
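
// Mask layout implied above: level i updates only the links in direction mu = (i/2) % Nd on
// checkerboard i % 2, which is why Nsmear is required to be a multiple of 2*Nd.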
|
||||
|
||||
virtual void smeared_force(GaugeField &SigmaTilde)
|
||||
{
|
||||
if (this->smearingLevels > 0)
|
||||
{
|
||||
double start = usecond();
|
||||
GaugeField force = SigmaTilde; // actually = U*SigmaTilde
|
||||
GaugeLinkField tmp_mu(SigmaTilde.Grid());
|
||||
|
||||
// Remove U from UdSdU
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
// to get just SigmaTilde
|
||||
tmp_mu = adj(peekLorentz(this->SmearedSet[this->smearingLevels - 1], mu)) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
|
||||
force = this->AnalyticSmearedForce(force, this->get_smeared_conf(ismr - 1),ismr);
|
||||
}
|
||||
|
||||
force = this->AnalyticSmearedForce(force, *this->ThinLinks,0);
|
||||
|
||||
// Add U to UdSdU
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
tmp_mu = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
|
||||
pokeLorentz(SigmaTilde, tmp_mu, mu);
|
||||
}
|
||||
|
||||
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << " GaugeConfigurationMasked: Smeared Force chain rule took " << time << " ms" << std::endl;
|
||||
|
||||
} // if smearingLevels = 0 do nothing
|
||||
SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -1,87 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/gauge/JacobianAction.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Jacobian Action ..
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template <class Gimpl>
|
||||
class JacobianAction : public Action<typename Gimpl::GaugeField> {
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
SmearedConfigurationMasked<Gimpl> * smearer;
|
||||
/////////////////////////// constructors
|
||||
explicit JacobianAction(SmearedConfigurationMasked<Gimpl> * _smearer ) { smearer=_smearer;};
|
||||
|
||||
virtual std::string action_name() {return "JacobianAction";}
|
||||
|
||||
virtual std::string LogParameters(){
|
||||
std::stringstream sstream;
|
||||
sstream << GridLogMessage << "[JacobianAction] " << std::endl;
|
||||
return sstream.str();
|
||||
}
|
||||
|
||||
//////////////////////////////////
|
||||
// Usual cases are not used
|
||||
//////////////////////////////////
|
||||
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ assert(0);};
|
||||
virtual RealD S(const GaugeField &U) { assert(0); }
|
||||
virtual void deriv(const GaugeField &U, GaugeField &dSdU) { assert(0); }
|
||||
|
||||
//////////////////////////////////
|
||||
// Functions of smart configs only
|
||||
//////////////////////////////////
|
||||
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
|
||||
{
|
||||
return;
|
||||
}
|
||||
virtual RealD S(ConfigurationBase<GaugeField>& U)
|
||||
{
|
||||
// det M = e^{ - ( - logDetM) }
|
||||
assert( &U == smearer );
|
||||
return -smearer->logDetJacobian();
|
||||
}
|
||||
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
|
||||
{
|
||||
return S(U);
|
||||
}
|
||||
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
|
||||
{
|
||||
assert( &U == smearer );
|
||||
smearer->logDetJacobianForce(dSdU);
|
||||
}
|
||||
|
||||
private:
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -40,9 +40,7 @@ template <class Gimpl>
|
||||
class Smear_Stout : public Smear<Gimpl> {
|
||||
private:
|
||||
int OrthogDim = -1;
|
||||
public:
|
||||
const std::vector<double> SmearRho;
|
||||
private:
|
||||
// Smear<Gimpl>* ownership semantics:
|
||||
// Smear<Gimpl>* passed in to constructor are owned by caller, so we don't delete them here
|
||||
// Smear<Gimpl>* created within constructor need to be deleted as part of the destructor
|
||||
|
@ -37,14 +37,13 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Make these members of an Impl class for BC's.
|
||||
|
||||
namespace PeriodicBC {
|
||||
//Out(x) = Link(x)*field(x+mu)
|
||||
|
||||
template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link,
|
||||
int mu,
|
||||
const Lattice<covariant> &field)
|
||||
{
|
||||
return Link*Cshift(field,mu,1);// moves towards negative mu
|
||||
}
|
||||
//Out(x) = Link^dag(x-mu)*field(x-mu)
|
||||
template<class covariant,class gauge> Lattice<covariant> CovShiftBackward(const Lattice<gauge> &Link,
|
||||
int mu,
|
||||
const Lattice<covariant> &field)
|
||||
@ -53,19 +52,19 @@ namespace PeriodicBC {
|
||||
tmp = adj(Link)*field;
|
||||
return Cshift(tmp,mu,-1);// moves towards positive mu
|
||||
}
|
||||
//Out(x) = Link^dag(x-mu)
|
||||
|
||||
template<class gauge> Lattice<gauge>
|
||||
CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu)
|
||||
{
|
||||
return Cshift(adj(Link), mu, -1);
|
||||
}
|
||||
//Out(x) = Link(x)
|
||||
|
||||
template<class gauge> Lattice<gauge>
|
||||
CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
|
||||
{
|
||||
return Link;
|
||||
}
|
||||
//Link(x) = Link(x+mu)
|
||||
|
||||
template<class gauge> Lattice<gauge>
|
||||
ShiftStaple(const Lattice<gauge> &Link, int mu)
|
||||
{
|
||||
|
@ -34,61 +34,6 @@ directory
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<int N, class Vec>
|
||||
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
|
||||
{
|
||||
GridBase *grid=Umu.Grid();
|
||||
auto lvol = grid->lSites();
|
||||
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
|
||||
typedef typename Vec::scalar_type scalar;
|
||||
autoView(Umu_v,Umu,CpuRead);
|
||||
autoView(ret_v,ret,CpuWrite);
|
||||
thread_for(site,lvol,{
|
||||
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
|
||||
Coordinate lcoor;
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
iScalar<iScalar<iMatrix<scalar, N> > > Us;
|
||||
peekLocalSite(Us, Umu_v, lcoor);
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
scalar tmp= Us()()(i,j);
|
||||
ComplexD ztmp(real(tmp),imag(tmp));
|
||||
EigenU(i,j)=ztmp;
|
||||
}}
|
||||
ComplexD detD = EigenU.determinant();
|
||||
typename Vec::scalar_type det(detD.real(),detD.imag());
|
||||
pokeLocalSite(det,ret_v,lcoor);
|
||||
});
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<int N, class Vec>
|
||||
static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
|
||||
{
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
auto det = Determinant(Umu);
|
||||
|
||||
det = conjugate(det);
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
|
||||
element = element * det;
|
||||
PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
|
||||
}
|
||||
}
|
||||
template<int N,class Vec>
|
||||
static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<Vec, N> >,Nd> > &U)
|
||||
{
|
||||
GridBase *grid=U.Grid();
|
||||
// Reunitarise
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
auto Umu = PeekIndex<LorentzIndex>(U,mu);
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
ProjectSUn(Umu);
|
||||
PokeIndex<LorentzIndex>(U,Umu,mu);
|
||||
}
|
||||
}
|
||||
|
||||
template <int ncolour>
|
||||
class SU {
|
||||
public:
|
||||
@ -796,14 +741,8 @@ public:
|
||||
typedef Lattice<vMatrixType> LatticeMatrixType;
|
||||
|
||||
LatticeMatrixType Umu(out.Grid());
|
||||
LatticeMatrixType tmp(out.Grid());
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
// LieRandomize(pRNG, Umu, 1.0);
|
||||
// PokeIndex<LorentzIndex>(out, Umu, mu);
|
||||
gaussian(pRNG,Umu);
|
||||
tmp = Ta(Umu);
|
||||
taExp(tmp,Umu);
|
||||
ProjectSUn(Umu);
|
||||
LieRandomize(pRNG, Umu, 1.0);
|
||||
PokeIndex<LorentzIndex>(out, Umu, mu);
|
||||
}
|
||||
}
|
||||
@ -860,12 +799,12 @@ public:
|
||||
};
|
||||
|
||||
template<int N>
|
||||
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
|
||||
LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
|
||||
{
|
||||
GridBase *grid=Umu.Grid();
|
||||
auto lvol = grid->lSites();
|
||||
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
|
||||
|
||||
LatticeComplexD ret(grid);
|
||||
|
||||
autoView(Umu_v,Umu,CpuRead);
|
||||
autoView(ret_v,ret,CpuWrite);
|
||||
thread_for(site,lvol,{
|
||||
@ -873,21 +812,42 @@ Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScala
|
||||
Coordinate lcoor;
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
|
||||
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
|
||||
peekLocalSite(Us, Umu_v, lcoor);
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
EigenU(i,j) = Us()()(i,j);
|
||||
}}
|
||||
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
Ui()()(i,j) = EigenUinv(i,j);
|
||||
}}
|
||||
pokeLocalSite(Ui,ret_v,lcoor);
|
||||
ComplexD det = EigenU.determinant();
|
||||
pokeLocalSite(det,ret_v,lcoor);
|
||||
});
|
||||
return ret;
|
||||
}
|
||||
template<int N>
|
||||
static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
|
||||
{
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
auto det = Determinant(Umu);
|
||||
|
||||
det = conjugate(det);
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
|
||||
element = element * det;
|
||||
PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
|
||||
}
|
||||
}
|
||||
template<int N>
|
||||
static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<vComplexD, N> >,Nd> > &U)
|
||||
{
|
||||
GridBase *grid=U.Grid();
|
||||
// Reunitarise
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
auto Umu = PeekIndex<LorentzIndex>(U,mu);
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
ProjectSUn(Umu);
|
||||
PokeIndex<LorentzIndex>(U,Umu,mu);
|
||||
}
|
||||
}
|
||||
// Explicit specialisation for SU(3).
|
||||
// Explicit specialisation for SU(3).
|
||||
static void
|
||||
|
@ -51,7 +51,6 @@ public:
|
||||
typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
|
||||
typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;
|
||||
|
||||
typedef Lattice<iScalar<iScalar<iVector<vComplex, Dimension> > > > LatticeAdjVector;
|
||||
|
||||
template <class cplx>
|
||||
static void generator(int Index, iSUnAdjointMatrix<cplx> &iAdjTa) {
|
||||
|
@ -290,7 +290,7 @@ public:
|
||||
}
|
||||
*/
|
||||
//////////////////////////////////////////////////
|
||||
// the sum over all nu-oriented staples for nu != mu on each site
|
||||
// the sum over all staples on each site
|
||||
//////////////////////////////////////////////////
|
||||
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
|
||||
@ -300,10 +300,6 @@ public:
|
||||
for (int d = 0; d < Nd; d++) {
|
||||
U[d] = PeekIndex<LorentzIndex>(Umu, d);
|
||||
}
|
||||
Staple(staple, U, mu);
|
||||
}
|
||||
|
||||
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
|
||||
staple = Zero();
|
||||
|
||||
for (int nu = 0; nu < Nd; nu++) {
|
||||
@ -339,202 +335,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
/////////////
|
||||
//Staples for each direction mu, summed over nu != mu
|
||||
//staple: output staples for each mu (Nd)
|
||||
//U: link array (Nd)
|
||||
/////////////
|
||||
static void StapleAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U) {
|
||||
assert(staple.size() == Nd); assert(U.size() == Nd);
|
||||
for(int mu=0;mu<Nd;mu++) Staple(staple[mu], U, mu);
|
||||
}
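
// Usage sketch (illustrative, not part of the interface): split a GaugeLorentz field into
// per-direction links first, e.g.
//   std::vector<GaugeMat> U(Nd, Umu.Grid()), staples(Nd, Umu.Grid());
//   for(int mu=0;mu<Nd;mu++) U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
//   StapleAll(staples, U);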
|
||||
|
||||
|
||||
//A workspace class allowing reuse of the stencil
|
||||
class WilsonLoopPaddedStencilWorkspace{
|
||||
std::unique_ptr<GeneralLocalStencil> stencil;
|
||||
size_t nshift;
|
||||
|
||||
void generateStencil(GridBase* padded_grid){
|
||||
double t0 = usecond();
|
||||
|
||||
//Generate shift arrays
|
||||
std::vector<Coordinate> shifts = this->getShifts();
|
||||
nshift = shifts.size();
|
||||
|
||||
double t1 = usecond();
|
||||
//Generate local stencil
|
||||
stencil.reset(new GeneralLocalStencil(padded_grid,shifts));
|
||||
double t2 = usecond();
|
||||
std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl;
|
||||
}
|
||||
public:
|
||||
//Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly
|
||||
const GeneralLocalStencil & getStencil(const PaddedCell &pcell){
|
||||
assert(pcell.depth >= this->paddingDepth());
|
||||
if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back());
|
||||
return *stencil;
|
||||
}
|
||||
size_t Nshift() const{ return nshift; }
|
||||
|
||||
virtual std::vector<Coordinate> getShifts() const = 0;
|
||||
virtual int paddingDepth() const = 0; //padding depth required
|
||||
|
||||
virtual ~WilsonLoopPaddedStencilWorkspace(){}
|
||||
};
|
||||
|
||||
//This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces
|
||||
class WilsonLoopPaddedWorkspace{
|
||||
std::vector<WilsonLoopPaddedStencilWorkspace*> stencil_wk;
|
||||
std::unique_ptr<PaddedCell> pcell;
|
||||
|
||||
void generatePcell(GridBase* unpadded_grid){
|
||||
assert(stencil_wk.size());
|
||||
int max_depth = 0;
|
||||
for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth());
|
||||
|
||||
pcell.reset(new PaddedCell(max_depth, dynamic_cast<GridCartesian*>(unpadded_grid)));
|
||||
}
|
||||
|
||||
public:
|
||||
//Add a stencil definition. This should be done before the first call to retrieve a stencil object.
|
||||
//Takes ownership of the pointer
|
||||
void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){
|
||||
assert(!pcell);
|
||||
stencil_wk.push_back(stencil);
|
||||
}
|
||||
|
||||
const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){
|
||||
if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
|
||||
return stencil_wk[stencil_idx]->getStencil(*pcell);
|
||||
}
|
||||
const PaddedCell & getPaddedCell(GridBase* unpadded_grid){
|
||||
if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
|
||||
return *pcell;
|
||||
}
|
||||
|
||||
~WilsonLoopPaddedWorkspace(){
|
||||
for(auto &s : stencil_wk) delete s;
|
||||
}
|
||||
};
|
||||
|
||||
//A workspace class allowing reuse of the stencil
|
||||
class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
|
||||
public:
|
||||
std::vector<Coordinate> getShifts() const override{
|
||||
std::vector<Coordinate> shifts;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
for(int nu=0;nu<Nd;nu++){
|
||||
if(nu != mu){
|
||||
Coordinate shift_0(Nd,0);
|
||||
Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
|
||||
Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
|
||||
Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
|
||||
Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
|
||||
|
||||
//U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
|
||||
shifts.push_back(shift_0);
|
||||
shifts.push_back(shift_nu);
|
||||
shifts.push_back(shift_mu);
|
||||
|
||||
//U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
|
||||
shifts.push_back(shift_mnu);
|
||||
shifts.push_back(shift_mnu);
|
||||
shifts.push_back(shift_mnu_pmu);
|
||||
}
|
||||
}
|
||||
}
|
||||
return shifts;
|
||||
}
|
||||
|
||||
int paddingDepth() const override{ return 1; }
|
||||
};
|
||||
|
||||
//Padded cell implementation of the staple method for all mu, summed over nu != mu
|
||||
//staple: output staple for each mu, summed over nu != mu (Nd)
|
||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||
//Cell: the padded cell class
|
||||
static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
|
||||
StaplePaddedAllWorkspace wk;
|
||||
StaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
|
||||
}
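
// For repeated evaluation prefer the overload below that takes a precomputed GeneralLocalStencil
// (obtained from a persistent StaplePaddedAllWorkspace), so the stencil is not rebuilt per call.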
|
||||
|
||||
//Padded cell implementation of the staple method for all mu, summed over nu != mu
|
||||
//staple: output staple for each mu, summed over nu != mu (Nd)
|
||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||
//Cell: the padded cell class
|
||||
//gStencil: the precomputed generalized local stencil for the staple
|
||||
static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
|
||||
double t0 = usecond();
|
||||
assert(U_padded.size() == Nd); assert(staple.size() == Nd);
|
||||
assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
|
||||
assert(Cell.depth >= 1);
|
||||
GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
|
||||
|
||||
int shift_mu_off = gStencil._npoints/Nd;
|
||||
|
||||
//Open views to padded gauge links and keep open over mu loop
|
||||
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
|
||||
size_t vsize = Nd*sizeof(GaugeViewType);
|
||||
GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
|
||||
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
|
||||
GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
|
||||
acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
|
||||
|
||||
GaugeMat gStaple(ggrid);
|
||||
|
||||
int outer_off = 0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
{ //view scope
|
||||
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
||||
auto gStencil_v = gStencil.View();
|
||||
|
||||
accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
|
||||
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
|
||||
stencil_ss = Zero();
|
||||
int off = outer_off;
|
||||
|
||||
for(int nu=0;nu<Nd;nu++){
|
||||
if(nu != mu){
|
||||
GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
|
||||
auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
|
||||
stencil_ss = stencil_ss + U2 * U1 * U0;
|
||||
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
|
||||
stencil_ss = stencil_ss + U2 * U1 * U0;
|
||||
}
|
||||
}
|
||||
|
||||
coalescedWrite(gStaple_v[ss],stencil_ss);
|
||||
}
|
||||
);
|
||||
} //ensure views are all closed!
|
||||
|
||||
staple[mu] = Cell.Extract(gStaple);
|
||||
outer_off += shift_mu_off;
|
||||
}//mu loop
|
||||
|
||||
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
|
||||
free(Ug_dirs_v_host);
|
||||
acceleratorFreeDevice(Ug_dirs_v);
|
||||
|
||||
double t1=usecond();
|
||||
|
||||
std::cout << GridLogPerformance << "StaplePaddedAll timing:" << (t1-t0)/1000 << "ms" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// the sum over all staples on each site in direction mu,nu, upper part
|
||||
//////////////////////////////////////////////////
|
||||
@ -907,14 +707,18 @@ public:
|
||||
// the sum over all staples on each site
|
||||
//////////////////////////////////////////////////
|
||||
static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
|
||||
U2 = U * Gimpl::CshiftLink(U, mu, 1);
|
||||
U2 = U * Cshift(U, mu, 1);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Hop by two optimisation strategy. Use RectStapleDouble to obtain 'U2'
|
||||
// Hop by two optimisation strategy does not work nicely with Gparity. (could
|
||||
// do,
|
||||
// but need to track two deep where cross boundary and apply a conjugation).
|
||||
// Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do
|
||||
// so .
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
static void RectStapleOptimised(GaugeMat &Stap, const std::vector<GaugeMat> &U2,
|
||||
const std::vector<GaugeMat> &U, int mu) {
|
||||
static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
|
||||
std::vector<GaugeMat> &U, int mu) {
|
||||
|
||||
Stap = Zero();
|
||||
|
||||
@ -928,9 +732,9 @@ public:
|
||||
|
||||
// Up staple ___ ___
|
||||
// | |
|
||||
tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
|
||||
tmp = Cshift(adj(U[nu]), nu, -1);
|
||||
tmp = adj(U2[mu]) * tmp;
|
||||
tmp = Gimpl::CshiftLink(tmp, mu, -2);
|
||||
tmp = Cshift(tmp, mu, -2);
|
||||
|
||||
Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);
|
||||
|
||||
@ -938,14 +742,14 @@ public:
|
||||
// |___ ___|
|
||||
//
|
||||
tmp = adj(U2[mu]) * U[nu];
|
||||
Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));
|
||||
Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));
|
||||
|
||||
// ___ ___
|
||||
// | ___|
|
||||
// |___ ___|
|
||||
//
|
||||
|
||||
Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
|
||||
Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
|
||||
|
||||
// ___ ___
|
||||
// |___ |
|
||||
@ -954,7 +758,7 @@ public:
|
||||
|
||||
// tmp= Staple2x1* Cshift(U[mu],mu,-2);
|
||||
// Stap+= Cshift(tmp,mu,1) ;
|
||||
Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
|
||||
Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);
|
||||
;
|
||||
|
||||
// --
|
||||
@ -962,10 +766,10 @@ public:
|
||||
//
|
||||
// | |
|
||||
|
||||
tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
|
||||
tmp = Cshift(adj(U2[nu]), nu, -2);
|
||||
tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
|
||||
tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
|
||||
Stap += Gimpl::CshiftLink(tmp, mu, 1);
|
||||
tmp = U2[nu] * Cshift(tmp, nu, 2);
|
||||
Stap += Cshift(tmp, mu, 1);
|
||||
|
||||
// | |
|
||||
//
|
||||
@ -974,12 +778,25 @@ public:
|
||||
|
||||
tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
|
||||
tmp = adj(U2[nu]) * tmp;
|
||||
tmp = Gimpl::CshiftLink(tmp, nu, -2);
|
||||
Stap += Gimpl::CshiftLink(tmp, mu, 1);
|
||||
tmp = Cshift(tmp, nu, -2);
|
||||
Stap += Cshift(tmp, mu, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
|
||||
RectStapleUnoptimised(Stap, Umu, mu);
|
||||
}
|
||||
static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
|
||||
std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
|
||||
int mu) {
|
||||
if (Gimpl::isPeriodicGaugeField()) {
|
||||
RectStapleOptimised(Stap, U2, U, mu);
|
||||
} else {
|
||||
RectStapleUnoptimised(Stap, Umu, mu);
|
||||
}
|
||||
}
|
||||
|
||||
static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
|
||||
int mu) {
|
||||
GridBase *grid = Umu.Grid();
|
||||
@ -1078,288 +895,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
|
||||
RectStapleUnoptimised(Stap, Umu, mu);
|
||||
}
|
||||
static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
|
||||
std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
|
||||
int mu) {
|
||||
RectStapleOptimised(Stap, U2, U, mu);
|
||||
}
|
||||
//////////////////////////////////////////////////////
|
||||
//Compute the rectangular staples for all orientations
|
||||
//Stap : Array of staples (Nd)
|
||||
//U: Gauge links in each direction (Nd)
|
||||
/////////////////////////////////////////////////////
|
||||
static void RectStapleAll(std::vector<GaugeMat> &Stap, const std::vector<GaugeMat> &U){
|
||||
assert(Stap.size() == Nd); assert(U.size() == Nd);
|
||||
std::vector<GaugeMat> U2(Nd,U[0].Grid());
|
||||
for(int mu=0;mu<Nd;mu++) RectStapleDouble(U2[mu], U[mu], mu);
|
||||
for(int mu=0;mu<Nd;mu++) RectStapleOptimised(Stap[mu], U2, U, mu);
|
||||
}
|
||||
|
||||
//A workspace class allowing reuse of the stencil
|
||||
class RectStaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
|
||||
public:
|
||||
std::vector<Coordinate> getShifts() const override{
|
||||
std::vector<Coordinate> shifts;
|
||||
for (int mu = 0; mu < Nd; mu++){
|
||||
for (int nu = 0; nu < Nd; nu++) {
|
||||
if (nu != mu) {
|
||||
auto genShift = [&](int mushift,int nushift){
|
||||
Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
|
||||
};
|
||||
|
||||
//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
|
||||
shifts.push_back(genShift(0,0));
|
||||
shifts.push_back(genShift(0,+1));
|
||||
shifts.push_back(genShift(+1,+1));
|
||||
shifts.push_back(genShift(+2,0));
|
||||
shifts.push_back(genShift(+1,0));
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
|
||||
shifts.push_back(genShift(0,-1));
|
||||
shifts.push_back(genShift(0,-1));
|
||||
shifts.push_back(genShift(+1,-1));
|
||||
shifts.push_back(genShift(+2,-1));
|
||||
shifts.push_back(genShift(+1,0));
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
|
||||
shifts.push_back(genShift(-1,0));
|
||||
shifts.push_back(genShift(-1,-1));
|
||||
shifts.push_back(genShift(-1,-1));
|
||||
shifts.push_back(genShift(0,-1));
|
||||
shifts.push_back(genShift(+1,-1));
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
|
||||
shifts.push_back(genShift(-1,0));
|
||||
shifts.push_back(genShift(-1,0));
|
||||
shifts.push_back(genShift(-1,+1));
|
||||
shifts.push_back(genShift(0,+1));
|
||||
shifts.push_back(genShift(+1,0));
|
||||
|
||||
//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
|
||||
shifts.push_back(genShift(0,0));
|
||||
shifts.push_back(genShift(0,+1));
|
||||
shifts.push_back(genShift(0,+2));
|
||||
shifts.push_back(genShift(+1,+1));
|
||||
shifts.push_back(genShift(+1,0));
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
|
||||
shifts.push_back(genShift(0,-1));
|
||||
shifts.push_back(genShift(0,-2));
|
||||
shifts.push_back(genShift(0,-2));
|
||||
shifts.push_back(genShift(+1,-2));
|
||||
shifts.push_back(genShift(+1,-1));
|
||||
}
|
||||
}
|
||||
}
|
||||
return shifts;
|
||||
}
|
||||
|
||||
int paddingDepth() const override{ return 2; }
|
||||
};
|
||||
|
||||
//Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
|
||||
//staple: output staple for each mu, summed over nu != mu (Nd)
|
||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||
//Cell: the padded cell class
|
||||
static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
|
||||
RectStaplePaddedAllWorkspace wk;
|
||||
RectStaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
|
||||
}
|
||||
|
||||
//Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
|
||||
//staple: output staple for each mu, summed over nu != mu (Nd)
|
||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||
//Cell: the padded cell class
|
||||
//gStencil: the stencil
|
||||
static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
|
||||
double t0 = usecond();
|
||||
assert(U_padded.size() == Nd); assert(staple.size() == Nd);
|
||||
assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
|
||||
assert(Cell.depth >= 2);
|
||||
GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
|
||||
|
||||
size_t nshift = gStencil._npoints;
|
||||
int mu_off_delta = nshift / Nd;
|
||||
|
||||
//Open views to padded gauge links and keep open over mu loop
|
||||
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
|
||||
size_t vsize = Nd*sizeof(GaugeViewType);
|
||||
GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
|
||||
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
|
||||
GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
|
||||
acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
|
||||
|
||||
GaugeMat gStaple(ggrid); //temp staple object on padded grid
|
||||
|
||||
int offset = 0;
|
||||
for(int mu=0; mu<Nd; mu++){
|
||||
|
||||
{ //view scope
|
||||
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
||||
auto gStencil_v = gStencil.View();
|
||||
|
||||
accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
|
||||
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
|
||||
stencil_ss = Zero();
|
||||
int s=offset;
|
||||
for(int nu=0;nu<Nd;nu++){
|
||||
if(nu != mu){
|
||||
//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
|
||||
GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
|
||||
auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
|
||||
|
||||
stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
|
||||
|
||||
stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
|
||||
stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
|
||||
stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
|
||||
|
||||
//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
|
||||
stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(s++,ss);
|
||||
U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
|
||||
stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
|
||||
|
||||
}
|
||||
}
|
||||
coalescedWrite(gStaple_v[ss],stencil_ss);
|
||||
}
|
||||
);
|
||||
offset += mu_off_delta;
|
||||
}//kernel/view scope
|
||||
|
||||
staple[mu] = Cell.Extract(gStaple);
|
||||
}//mu loop
|
||||
|
||||
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
|
||||
free(Ug_dirs_v_host);
|
||||
acceleratorFreeDevice(Ug_dirs_v);
|
||||
|
||||
double t1 = usecond();
|
||||
|
||||
std::cout << GridLogPerformance << "RectStaplePaddedAll timings:" << (t1-t0)/1000 << "ms" << std::endl;
|
||||
}
|
||||
|
||||
//A workspace for reusing the PaddedCell and GeneralLocalStencil objects
|
||||
class StapleAndRectStapleAllWorkspace: public WilsonLoopPaddedWorkspace{
|
||||
public:
|
||||
StapleAndRectStapleAllWorkspace(){
|
||||
this->addStencil(new StaplePaddedAllWorkspace);
|
||||
this->addStencil(new RectStaplePaddedAllWorkspace);
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
//Compute the 1x1 and 1x2 staples for all orientations
|
||||
//Stap : Array of staples (Nd)
|
||||
//RectStap: Array of rectangular staples (Nd)
|
||||
//U: Gauge links in each direction (Nd)
|
||||
/////////////////////////////////////////////////////
|
||||
static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U){
|
||||
StapleAndRectStapleAllWorkspace wk;
|
||||
StapleAndRectStapleAll(Stap,RectStap,U,wk);
|
||||
}
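
// Reuse sketch (illustrative): keep one workspace alive across calls so the PaddedCell and both
// stencils are constructed only once, e.g.
//   static StapleAndRectStapleAllWorkspace wk;
//   StapleAndRectStapleAll(Stap, RectStap, U, wk);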
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
//Compute the 1x1 and 1x2 staples for all orientations
|
||||
//Stap : Array of staples (Nd)
|
||||
//RectStap: Array of rectangular staples (Nd)
|
||||
//U: Gauge links in each direction (Nd)
|
||||
//wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse
|
||||
/////////////////////////////////////////////////////
|
||||
static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U, StapleAndRectStapleAllWorkspace &wk){
|
||||
#if 0
|
||||
StapleAll(Stap, U);
|
||||
RectStapleAll(RectStap, U);
|
||||
#else
|
||||
double t0 = usecond();
|
||||
|
||||
GridCartesian* unpadded_grid = dynamic_cast<GridCartesian*>(U[0].Grid());
|
||||
const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid);
|
||||
|
||||
CshiftImplGauge<Gimpl> cshift_impl;
|
||||
std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
|
||||
for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
|
||||
double t1 = usecond();
|
||||
StaplePaddedAll(Stap, U_pad, Ghost, wk.getStencil(0,unpadded_grid) );
|
||||
double t2 = usecond();
|
||||
RectStaplePaddedAll(RectStap, U_pad, Ghost, wk.getStencil(1,unpadded_grid));
|
||||
double t3 = usecond();
|
||||
std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
|
||||
#endif
|
||||
}
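// Usage note (added sketch, not part of the original source): the workspace variant is
// intended to be constructed once and reused, so that the PaddedCell halo geometry and
// the GeneralLocalStencil tables are built only on the first call for a given grid, e.g.
//
//   StapleAndRectStapleAllWorkspace wk;                    // persists across iterations
//   std::vector<GaugeMat> Stap(Nd,U[0].Grid()), RectStap(Nd,U[0].Grid());
//   for(int it=0; it<Nit; it++){                           // Nit is a hypothetical loop bound
//     StapleAndRectStapleAll(Stap, RectStap, U, wk);       // stencils/padded cells reused after it==0
//     // ... use Stap and RectStap, e.g. in a smearing or force computation ...
//   }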
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// Wilson loop of size (R1, R2), oriented in mu,nu plane
|
||||
//////////////////////////////////////////////////
|
||||
|
@ -79,60 +79,60 @@ public:
|
||||
this->_entries.resize(npoints* osites);
|
||||
this->_entries_p = &_entries[0];
|
||||
|
||||
thread_for(site, osites, {
|
||||
Coordinate Coor;
|
||||
Coordinate NbrCoor;
|
||||
|
||||
for(Integer ii=0;ii<npoints;ii++){
|
||||
Integer lex = site*npoints+ii;
|
||||
GeneralStencilEntry SE;
|
||||
////////////////////////////////////////////////
|
||||
// Outer index of neighbour Offset calculation
|
||||
////////////////////////////////////////////////
|
||||
grid->oCoorFromOindex(Coor,site);
|
||||
for(int d=0;d<Coor.size();d++){
|
||||
int rd = grid->_rdimensions[d];
|
||||
NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
|
||||
}
|
||||
SE._offset = grid->oIndexReduced(NbrCoor);
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Inner index permute calculation
|
||||
// Simpler version using icoor calculation
|
||||
////////////////////////////////////////////////
|
||||
SE._permute =0;
|
||||
for(int d=0;d<Coor.size();d++){
|
||||
|
||||
int fd = grid->_fdimensions[d];
|
||||
int rd = grid->_rdimensions[d];
|
||||
int ly = grid->_simd_layout[d];
|
||||
|
||||
assert((ly==1)||(ly==2));
|
||||
|
||||
int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1
|
||||
int x = Coor[d]; // x in [0... rd-1] as an oSite
|
||||
|
||||
int permute_dim = grid->PermuteDim(d);
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int num = shift%rd; // Slice within dest osite cell of slice zero
|
||||
int wrap = shift/rd; // Number of osite local volume cells crossed through
|
||||
// x+num < rd dictates whether we are in same permute state as slice 0
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice=(wrap+1)%ly;
|
||||
}
|
||||
if ( permute_slice ) {
|
||||
int ptype =grid->PermuteType(d);
|
||||
uint8_t mask =0x1<<ptype;
|
||||
SE._permute |= mask;
|
||||
}
|
||||
}
|
||||
////////////////////////////////////////////////
|
||||
// Store in look up table
|
||||
////////////////////////////////////////////////
|
||||
this->_entries[lex] = SE;
|
||||
Coordinate Coor;
|
||||
Coordinate NbrCoor;
|
||||
for(Integer site=0;site<osites;site++){
|
||||
for(Integer ii=0;ii<npoints;ii++){
|
||||
Integer lex = site*npoints+ii;
|
||||
GeneralStencilEntry SE;
|
||||
////////////////////////////////////////////////
|
||||
// Outer index of neighbour Offset calculation
|
||||
////////////////////////////////////////////////
|
||||
grid->oCoorFromOindex(Coor,site);
|
||||
for(int d=0;d<Coor.size();d++){
|
||||
int rd = grid->_rdimensions[d];
|
||||
NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
|
||||
}
|
||||
});
|
||||
SE._offset = grid->oIndexReduced(NbrCoor);
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Inner index permute calculation
|
||||
// Simpler version using icoor calculation
|
||||
////////////////////////////////////////////////
|
||||
SE._permute =0;
|
||||
for(int d=0;d<Coor.size();d++){
|
||||
|
||||
int fd = grid->_fdimensions[d];
|
||||
int rd = grid->_rdimensions[d];
|
||||
int ly = grid->_simd_layout[d];
|
||||
|
||||
assert((ly==1)||(ly==2));
|
||||
|
||||
int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1
|
||||
int x = Coor[d]; // x in [0... rd-1] as an oSite
|
||||
|
||||
int permute_dim = grid->PermuteDim(d);
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int num = shift%rd; // Slice within dest osite cell of slice zero
|
||||
int wrap = shift/rd; // Number of osite local volume cells crossed through
|
||||
// x+num < rd dictates whether we are in same permute state as slice 0
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice=(wrap+1)%ly;
|
||||
}
|
||||
if ( permute_slice ) {
|
||||
int ptype =grid->PermuteType(d);
|
||||
uint8_t mask =grid->Nsimd() >> (ptype + 1);
|
||||
SE._permute |= mask;
|
||||
}
|
||||
}
|
||||
////////////////////////////////////////////////
|
||||
// Store in look up table
|
||||
////////////////////////////////////////////////
|
||||
this->_entries[lex] = SE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -32,7 +32,6 @@
|
||||
|
||||
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
|
||||
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
||||
#include <Grid/stencil/GeneralLocalStencil.h>
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
@ -452,6 +451,7 @@ public:
|
||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
else DslashLogFull();
|
||||
acceleratorCopySynchronise();
|
||||
// Everyone agrees we are all done
|
||||
_grid->StencilBarrier();
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
@ -540,7 +540,6 @@ public:
|
||||
compress.Point(point);
|
||||
HaloGatherDir(source,compress,point,face_idx);
|
||||
}
|
||||
accelerator_barrier();
|
||||
face_table_computed=1;
|
||||
assert(u_comm_offset==_unified_buffer_size);
|
||||
|
||||
@ -706,7 +705,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||
}
|
||||
/// Introduce a block structure and switch off comms on boundaries
|
||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||
|
@ -73,16 +73,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
|
||||
return vec;
|
||||
}
|
||||
}
|
||||
//'perm_mask' acts as a bitmask
|
||||
template<class vobj> accelerator_inline
|
||||
vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
|
||||
{
|
||||
auto obj = vec, tmp = vec;
|
||||
for (int d=0;d<nd;d++)
|
||||
if (perm_mask & (0x1 << d)) { permute(obj,tmp,d); tmp=obj;}
|
||||
return obj;
|
||||
}
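// Added note (not in the original): 'perm_mask' packs one bit per dimension d; a set bit
// requests a SIMD permute of type d. For example, with nd=4 a mask of 0x5 (binary 0101)
// applies permute(...,0) followed by permute(...,2):
//
//   auto v = coalescedReadGeneralPermute(vec, 0x5, 4);   // permute types 0 and 2 only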
|
||||
|
||||
template<class vobj> accelerator_inline
|
||||
void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
|
||||
{
|
||||
@ -93,7 +83,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
|
||||
{
|
||||
vstream(vec, extracted);
|
||||
}
|
||||
#else //==GRID_SIMT
|
||||
#else
|
||||
|
||||
|
||||
//#ifndef GRID_SYCL
|
||||
@ -176,14 +166,6 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,
|
||||
return extractLane(plane,vec);
|
||||
}
|
||||
template<class vobj> accelerator_inline
|
||||
typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
|
||||
{
|
||||
int plane = lane;
|
||||
for (int d=0;d<nd;d++)
|
||||
plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane;
|
||||
return extractLane(plane,vec);
|
||||
}
|
||||
template<class vobj> accelerator_inline
|
||||
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
|
||||
{
|
||||
insertLane(lane,vec,extracted);
|
||||
|
@ -90,12 +90,10 @@ template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnGroup(c
|
||||
template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
|
||||
accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
|
||||
{
|
||||
typedef typename iMatrix<vtype,N>::scalar_type scalar;
|
||||
// need a check for the group type?
|
||||
iMatrix<vtype,N> ret(arg);
|
||||
vtype nrm;
|
||||
vtype inner;
|
||||
scalar one(1.0);
|
||||
for(int c1=0;c1<N;c1++){
|
||||
|
||||
// Normalises row c1
|
||||
@ -104,7 +102,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
|
||||
inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
|
||||
|
||||
nrm = sqrt(inner);
|
||||
nrm = one/nrm;
|
||||
nrm = 1.0/nrm;
|
||||
for(int c2=0;c2<N;c2++)
|
||||
ret._internal[c1][c2]*= nrm;
|
||||
|
||||
@ -129,7 +127,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
|
||||
inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
|
||||
|
||||
nrm = sqrt(inner);
|
||||
nrm = one/nrm;
|
||||
nrm = 1.0/nrm;
|
||||
for(int c2=0;c2<N;c2++)
|
||||
ret._internal[c1][c2]*= nrm;
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
|
||||
|
||||
|
||||
// Specialisation: Cayley-Hamilton exponential for SU(3)
|
||||
#if 0
|
||||
#ifndef GRID_ACCELERATED
|
||||
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
|
||||
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
|
||||
{
|
||||
|
224
HMC/FTHMC2p1f.cc
@ -1,224 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Copyright (C) 2023
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
std::cout << std::setprecision(12);
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
int threads = GridThread::GetThreads();
|
||||
// here make a routine to print all the relevant information on the run
|
||||
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
|
||||
|
||||
// Typedefs to simplify notation
|
||||
typedef WilsonImplR FermionImplPolicy;
|
||||
typedef MobiusFermionD FermionAction;
|
||||
typedef typename FermionAction::FermionField FermionField;
|
||||
|
||||
typedef Grid::XmlReader Serialiser;
|
||||
|
||||
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
||||
IntegratorParameters MD;
|
||||
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
|
||||
// MD.name = std::string("Leap Frog");
|
||||
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
|
||||
// MD.name = std::string("Force Gradient");
|
||||
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
|
||||
MD.name = std::string("MinimumNorm2");
|
||||
MD.MDsteps = 12;
|
||||
MD.trajL = 1.0;
|
||||
|
||||
HMCparameters HMCparams;
|
||||
HMCparams.StartTrajectory = 0;
|
||||
HMCparams.Trajectories = 200;
|
||||
HMCparams.NoMetropolisUntil= 20;
|
||||
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
||||
HMCparams.StartingType =std::string("HotStart");
|
||||
HMCparams.MD = MD;
|
||||
HMCWrapper TheHMC(HMCparams);
|
||||
|
||||
// Grid from the command line arguments --grid and --mpi
|
||||
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
|
||||
|
||||
CheckpointerParameters CPparams;
|
||||
CPparams.config_prefix = "ckpoint_EODWF_lat";
|
||||
CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
|
||||
CPparams.rng_prefix = "ckpoint_EODWF_rng";
|
||||
CPparams.saveInterval = 1;
|
||||
CPparams.saveSmeared = true;
|
||||
CPparams.format = "IEEE64BIG";
|
||||
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
|
||||
|
||||
RNGModuleParameters RNGpar;
|
||||
RNGpar.serial_seeds = "1 2 3 4 5";
|
||||
RNGpar.parallel_seeds = "6 7 8 9 10";
|
||||
TheHMC.Resources.SetRNGSeeds(RNGpar);
|
||||
|
||||
// Construct observables
|
||||
// here there is too much indirection
|
||||
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
|
||||
TheHMC.Resources.AddObservable<PlaqObs>();
|
||||
//////////////////////////////////////////////
|
||||
|
||||
const int Ls = 16;
|
||||
Real beta = 2.13;
|
||||
Real light_mass = 0.01;
|
||||
Real strange_mass = 0.04;
|
||||
Real pv_mass = 1.0;
|
||||
RealD M5 = 1.8;
|
||||
RealD b = 1.0; // Scale factor two
|
||||
RealD c = 0.0;
|
||||
|
||||
OneFlavourRationalParams OFRp;
|
||||
OFRp.lo = 1.0e-2;
|
||||
OFRp.hi = 64;
|
||||
OFRp.MaxIter = 10000;
|
||||
OFRp.tolerance= 1.0e-10;
|
||||
OFRp.degree = 14;
|
||||
OFRp.precision= 40;
|
||||
|
||||
std::vector<Real> hasenbusch({ 0.1 });
|
||||
|
||||
auto GridPtr = TheHMC.Resources.GetCartesian();
|
||||
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
|
||||
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
|
||||
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
|
||||
|
||||
IwasakiGaugeActionR GaugeAction(beta);
|
||||
|
||||
// temporarily need a gauge field
|
||||
LatticeGaugeField U(GridPtr);
|
||||
LatticeGaugeField Uhot(GridPtr);
|
||||
|
||||
// These lines are unnecessary if the BCs are all periodic
|
||||
std::vector<Complex> boundary = {1,1,1,-1};
|
||||
FermionAction::ImplParams Params(boundary);
|
||||
|
||||
double StoppingCondition = 1e-10;
|
||||
double MaxCGIterations = 30000;
|
||||
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
|
||||
|
||||
bool ApplySmearing = true;
|
||||
|
||||
////////////////////////////////////
|
||||
// Collect actions
|
||||
////////////////////////////////////
|
||||
ActionLevel<HMCWrapper::Field> Level1(1);
|
||||
ActionLevel<HMCWrapper::Field> Level2(2);
|
||||
ActionLevel<HMCWrapper::Field> Level3(4);
|
||||
|
||||
////////////////////////////////////
|
||||
// Strange action
|
||||
////////////////////////////////////
|
||||
|
||||
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
|
||||
MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
|
||||
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
|
||||
EOFA(Strange_Op_L, Strange_Op_R,
|
||||
CG,
|
||||
CG, CG,
|
||||
CG, CG,
|
||||
OFRp, false);
|
||||
|
||||
EOFA.is_smeared = ApplySmearing;
|
||||
Level1.push_back(&EOFA);
|
||||
|
||||
////////////////////////////////////
|
||||
// up down action
|
||||
////////////////////////////////////
|
||||
std::vector<Real> light_den;
|
||||
std::vector<Real> light_num;
|
||||
|
||||
int n_hasenbusch = hasenbusch.size();
|
||||
light_den.push_back(light_mass);
|
||||
for(int h=0;h<n_hasenbusch;h++){
|
||||
light_den.push_back(hasenbusch[h]);
|
||||
light_num.push_back(hasenbusch[h]);
|
||||
}
|
||||
light_num.push_back(pv_mass);
|
||||
|
||||
std::vector<FermionAction *> Numerators;
|
||||
std::vector<FermionAction *> Denominators;
|
||||
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
|
||||
|
||||
for(int h=0;h<n_hasenbusch+1;h++){
|
||||
std::cout << GridLogMessage << " 2f quotient Action "<< light_num[h] << " / " << light_den[h]<< std::endl;
|
||||
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
|
||||
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
|
||||
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
|
||||
}
|
||||
|
||||
for(int h=0;h<n_hasenbusch+1;h++){
|
||||
Quotients[h]->is_smeared = ApplySmearing;
|
||||
Level1.push_back(Quotients[h]);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// lnDetJacobianAction
|
||||
/////////////////////////////////////////////////////////////
|
||||
double rho = 0.1; // smearing parameter
|
||||
int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd
|
||||
int Nstep = 8*Nsmear; // number of smearing steps - must be a multiple of 2Nd
|
||||
Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
|
||||
SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
|
||||
JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
|
||||
if( ApplySmearing ) Level2.push_back(&Jacobian);
|
||||
std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// Gauge action
|
||||
/////////////////////////////////////////////////////////////
|
||||
// GaugeAction.is_smeared = ApplySmearing;
|
||||
GaugeAction.is_smeared = true;
|
||||
Level3.push_back(&GaugeAction);
|
||||
|
||||
std::cout << GridLogMessage << " ************************************************"<< std::endl;
|
||||
std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
|
||||
std::cout << GridLogMessage << " ************************************************"<< std::endl;
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
|
||||
|
||||
std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
|
||||
|
||||
TheHMC.TheAction.push_back(Level1);
|
||||
TheHMC.TheAction.push_back(Level2);
|
||||
TheHMC.TheAction.push_back(Level3);
|
||||
|
||||
TheHMC.Run(SmearingPolicy); // for smearing
|
||||
|
||||
Grid_finalize();
|
||||
} // main
|
||||
|
||||
|
||||
|
@ -146,8 +146,6 @@ NAMESPACE_END(Grid);
|
||||
int main(int argc, char **argv) {
|
||||
using namespace Grid;
|
||||
|
||||
std::cout << " Grid Initialise "<<std::endl;
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
|
||||
CartesianCommunicator::BarrierWorld();
|
||||
@ -172,24 +170,24 @@ int main(int argc, char **argv) {
|
||||
IntegratorParameters MD;
|
||||
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
|
||||
// MD.name = std::string("Leap Frog");
|
||||
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
|
||||
// MD.name = std::string("Force Gradient");
|
||||
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
|
||||
MD.name = std::string("MinimumNorm2");
|
||||
typedef GenericHMCRunner<ForceGradient> HMCWrapper;
|
||||
MD.name = std::string("Force Gradient");
|
||||
//typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
|
||||
// MD.name = std::string("MinimumNorm2");
|
||||
// TrajL = 2
|
||||
// 4/2 => 0.6 dH
|
||||
// 3/3 => 0.8 dH .. depth 3, slower
|
||||
//MD.MDsteps = 4;
|
||||
MD.MDsteps = 14;
|
||||
MD.MDsteps = 12;
|
||||
MD.trajL = 0.5;
|
||||
|
||||
HMCparameters HMCparams;
|
||||
HMCparams.StartTrajectory = 1077;
|
||||
HMCparams.Trajectories = 20;
|
||||
HMCparams.Trajectories = 1;
|
||||
HMCparams.NoMetropolisUntil= 0;
|
||||
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
||||
HMCparams.StartingType =std::string("ColdStart");
|
||||
// HMCparams.StartingType =std::string("CheckpointStart");
|
||||
// HMCparams.StartingType =std::string("ColdStart");
|
||||
HMCparams.StartingType =std::string("CheckpointStart");
|
||||
HMCparams.MD = MD;
|
||||
HMCWrapper TheHMC(HMCparams);
|
||||
|
||||
@ -225,7 +223,7 @@ int main(int argc, char **argv) {
|
||||
Real pv_mass = 1.0;
|
||||
// std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
|
||||
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
|
||||
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
|
||||
|
||||
auto GridPtr = TheHMC.Resources.GetCartesian();
|
||||
@ -277,10 +275,10 @@ int main(int argc, char **argv) {
|
||||
|
||||
// double StoppingCondition = 1e-14;
|
||||
// double MDStoppingCondition = 1e-9;
|
||||
double StoppingCondition = 1e-9;
|
||||
double MDStoppingCondition = 1e-8;
|
||||
double MDStoppingConditionLoose = 1e-8;
|
||||
double MDStoppingConditionStrange = 1e-8;
|
||||
double StoppingCondition = 1e-8;
|
||||
double MDStoppingCondition = 1e-7;
|
||||
double MDStoppingConditionLoose = 1e-7;
|
||||
double MDStoppingConditionStrange = 1e-7;
|
||||
double MaxCGIterations = 300000;
|
||||
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
|
||||
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
|
||||
|
@ -1,90 +0,0 @@
|
||||
Branch: develop
|
||||
|
||||
Files:
|
||||
|
||||
Grid/lattice/PaddedCell.h -- Halo exchange
|
||||
tests/Test_general_stencil.cc -- test local off axis stencil addressing
|
||||
tests/debug/Test_padded_cell.cc -- test PaddedCell halo exchange and the General local stencil by computing ALL plaquettes on lattice
|
||||
|
||||
Functionality:
|
||||
|
||||
-- extend a lattice field:
|
||||
Grid/lattice/PaddedCell.h
|
||||
|
||||
// Constructor
|
||||
PaddedCell(int _depth,GridCartesian *_grid)
|
||||
|
||||
// Expand a field "in" to depth "d"
|
||||
template<class vobj>
|
||||
inline Lattice<vobj> Exchange(Lattice<vobj> &in)
|
||||
|
||||
// Take the "apple core" of in to a smaller local volume
|
||||
template<class vobj>
|
||||
inline Lattice<vobj> Extract(Lattice<vobj> &in)
|
||||
|
||||
-- Plaquette test:
|
||||
tests/debug/Test_padded_cell.cc
|
||||
/////////////////////////////////////////////////
|
||||
// Create a padded cell of extra padding depth=1
|
||||
/////////////////////////////////////////////////
|
||||
int depth = 1;
|
||||
PaddedCell Ghost(depth,&GRID);
|
||||
LatticeGaugeField Ughost = Ghost.Exchange(Umu);
|
||||
|
||||
///// Array for the site plaquette
|
||||
GridBase *GhostGrid = Ughost.Grid();
|
||||
LatticeComplex gplaq(GhostGrid);
|
||||
|
||||
std::vector<Coordinate> shifts;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
for(int nu=mu+1;nu<Nd;nu++){
|
||||
|
||||
// Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
|
||||
Coordinate shift_0(Nd,0);
|
||||
Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
|
||||
Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
|
||||
shifts.push_back(shift_0);
|
||||
shifts.push_back(shift_mu);
|
||||
shifts.push_back(shift_nu);
|
||||
shifts.push_back(shift_0);
|
||||
}
|
||||
}
|
||||
GeneralLocalStencil gStencil(GhostGrid,shifts);
|
||||
|
||||
gplaq=Zero();
|
||||
{
|
||||
autoView( gp_v , gplaq, CpuWrite);
|
||||
autoView( t_v , trplaq, CpuRead);
|
||||
autoView( U_v , Ughost, CpuRead);
|
||||
for(int ss=0;ss<gp_v.size();ss++){
|
||||
int s=0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
for(int nu=mu+1;nu<Nd;nu++){
|
||||
|
||||
auto SE0 = gStencil.GetEntry(s+0,ss);
|
||||
auto SE1 = gStencil.GetEntry(s+1,ss);
|
||||
auto SE2 = gStencil.GetEntry(s+2,ss);
|
||||
auto SE3 = gStencil.GetEntry(s+3,ss);
|
||||
|
||||
int o0 = SE0->_offset;
|
||||
int o1 = SE1->_offset;
|
||||
int o2 = SE2->_offset;
|
||||
int o3 = SE3->_offset;
|
||||
|
||||
auto U0 = U_v[o0](mu);
|
||||
auto U1 = U_v[o1](nu);
|
||||
auto U2 = adj(U_v[o2](mu));
|
||||
auto U3 = adj(U_v[o3](nu));
|
||||
|
||||
gpermute(U0,SE0->_permute);
|
||||
gpermute(U1,SE1->_permute);
|
||||
gpermute(U2,SE2->_permute);
|
||||
gpermute(U3,SE3->_permute);
|
||||
|
||||
gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 );
|
||||
s=s+4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cplaq = Ghost.Extract(gplaq);
|
@ -10,8 +10,9 @@ For first time setup of the Xcode and Grid build environment on Mac OS, you will
|
||||
|
||||
1. Install Xcode and the Xcode command-line utilities
|
||||
2. Set Grid environment variables
|
||||
3. Install and build Grid pre-requisites
|
||||
4. Install, Configure and Build Grid
|
||||
3. Install and build Open MPI ***optional***
|
||||
4. Install and build Grid pre-requisites
|
||||
5. Install, Configure and Build Grid
|
||||
|
||||
Apple's [Xcode website][Xcode] is the go-to reference for 1, and the definitive reference for 4 and 5 is the [Grid Documentation][GridDoc].
|
||||
|
||||
@ -91,33 +92,60 @@ launchctl setenv GridPkg /opt/local</string>
|
||||
</plist>
|
||||
```
|
||||
|
||||
## 3. Install and build Grid pre-requisites
## 3. Install and build Open MPI -- ***optional***

Download the latest Open MPI 3.1 release (I used 3.1.5) and build it like so:

[OMPI]: https://www.open-mpi.org/software/ompi/v3.1/

../configure CC=clang CXX=clang++ CXXFLAGS=-g --prefix=$GridPre/bin
make -j 4 all install

***Note the `/bin` at the end of the prefix - this is required. As a quirk of the OpenMPI installer, `--prefix` must point to the `bin` subdirectory, with other files installed in `$GridPre/include`, `$GridPre/lib`, `$GridPre/share`, etc.***

Grid does not have any dependencies on Fortran; however, many standard scientific packages do, so you may wish to install GNU Fortran (e.g. the MacPorts ``gfortran`` package) and add the following to your configure invocation:

F77=gfortran FC=gfortran
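For example (a sketch that simply combines the commands above; adjust paths to your own setup), the full Open MPI configure and build including the Fortran compilers would be:

```
../configure CC=clang CXX=clang++ F77=gfortran FC=gfortran CXXFLAGS=-g --prefix=$GridPre/bin
make -j 4 all install
```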
|
||||
|
||||
## 4. Install and build Grid pre-requisites
|
||||
|
||||
To simplify the installation of **Grid pre-requisites**, you can use your favourite package manager, e.g.:
|
||||
|
||||
### 3.1. [MacPorts][MacPorts]
|
||||
### 1. [MacPorts][MacPorts]
|
||||
|
||||
[MacPorts]: https://www.macports.org "MacPorts package manager"
|
||||
|
||||
Install [MacPorts][MacPorts] if you haven't done so already, and then install packages with:
|
||||
|
||||
sudo port install openmpi git-flow-avh gmp hdf5 mpfr fftw-3-single lapack wget autoconf automake bison cmake gawk libomp
|
||||
sudo port install <portname>
|
||||
|
||||
On a Mac without GPUs:
|
||||
These are the `portname`s for mandatory Grid libraries:
|
||||
|
||||
sudo port install OpenBLAS +native
|
||||
* git-flow-avh
|
||||
* gmp
|
||||
* hdf5
|
||||
* mpfr
|
||||
|
||||
To use `Gnu sha256sum`:
|
||||
and these are the `portname`s for optional Grid libraries:
|
||||
|
||||
pushd /opt/local/bin; sudo ln -s gsha256sum sha256sum; popd
|
||||
* fftw-3-single
|
||||
* lapack
|
||||
* doxygen
|
||||
* OpenBLAS
|
||||
|
||||
These `port`s are not strictly necessary, but they are helpful:
|
||||
***Please update this list with any packages I've missed! ... and double-check whether OpenBLAS is really for Grid. NB: lapack doesn't seem to work. Should it be scalapack?***
|
||||
|
||||
sudo port install gnuplot gsl h5utils nasm rclone texinfo tree xorg-server
|
||||
### 2. [Homebrew][Homebrew]
|
||||
|
||||
***Please update this list with any packages I've missed!***
|
||||
[Homebrew]: https://brew.sh "Homebrew package manager"
|
||||
|
||||
#### Install LIME
|
||||
Install [Homebrew][Homebrew] if you haven't done so already, and then install packages with:
|
||||
|
||||
sudo brew install <packagename>
|
||||
|
||||
The same packages are available as with MacPorts.
|
||||
|
||||
### Install LIME ***optional***
|
||||
|
||||
There isn't currently a port for [C-LIME][C-LIME], so download the source and then build it:
|
||||
|
||||
@ -126,19 +154,9 @@ There isn't currently a port for [C-LIME][C-LIME], so download the source and th
|
||||
../configure CC=clang --prefix=$GridPre
|
||||
make -j 4 all install
|
||||
|
||||
### 3.2. [Homebrew][Homebrew]
|
||||
## 5. Install, Configure and Build Grid
|
||||
|
||||
[Homebrew]: https://brew.sh "Homebrew package manager"
|
||||
|
||||
Install [Homebrew][Homebrew] if you haven't done so already, and then install packages with:
|
||||
|
||||
sudo brew install <packagename>
|
||||
|
||||
I don't use Homebrew, so I'm not sure what the Brew package name equivalents are. ** Please update if you know **
|
||||
|
||||
## 4. Install, Configure and Build Grid
|
||||
|
||||
### 4.1 Install Grid
|
||||
### 5.1 Install Grid
|
||||
|
||||
[Grid]: https://github.com/paboyle/Grid
|
||||
|
||||
@ -156,7 +174,7 @@ or
|
||||
|
||||
depending on how many times you like to enter your password.
|
||||
|
||||
### 4.2 Configure Grid
### 5.2 Configure Grid

The Xcode build system supports multiple configurations for each project, by default `Debug` and `Release`, but more configurations can be defined. We will create a separate Grid build directory for each configuration, using the Grid **Autoconf** build system to make each one. NB: it is **not** necessary to run `make install` on them once they are built (IDE features such as *jump to definition* will work better if you don't).
|
||||
|
||||
@ -180,7 +198,7 @@ Debug configuration with MPI:
|
||||
|
||||
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug
|
||||
|
||||
### 4.3 Build Grid
### 5.3 Build Grid

Each configuration must be built before it can be used. You can either:
|
||||
|
||||
|
@ -60,7 +60,7 @@ while test $# -gt 0; do
|
||||
;;
|
||||
|
||||
--cxxflags)
|
||||
echo @GRID_CXXFLAGS@ -I@prefix@/include
|
||||
echo @GRID_CXXFLAGS@
|
||||
;;
|
||||
|
||||
--cxx)
|
||||
@ -72,11 +72,11 @@ while test $# -gt 0; do
|
||||
;;
|
||||
|
||||
--ldflags)
|
||||
echo @GRID_LDFLAGS@ -L@prefix@/lib
|
||||
echo @GRID_LDFLAGS@
|
||||
;;
|
||||
|
||||
--libs)
|
||||
echo @GRID_LIBS@ -lGrid
|
||||
echo @GRID_LIBS@
|
||||
;;
|
||||
|
||||
--summary)
|
||||
|
@ -1,44 +0,0 @@
|
||||
#!/bin/bash -l
|
||||
#SBATCH --job-name=bench_lehner
|
||||
#SBATCH --partition=small-g
|
||||
#SBATCH --nodes=2
|
||||
#SBATCH --ntasks-per-node=8
|
||||
#SBATCH --cpus-per-task=7
|
||||
#SBATCH --gpus-per-node=8
|
||||
#SBATCH --time=00:10:00
|
||||
#SBATCH --account=project_465000546
|
||||
#SBATCH --gpu-bind=none
|
||||
#SBATCH --exclusive
|
||||
#SBATCH --mem=0
|
||||
|
||||
CPU_BIND="map_cpu:48,56,32,40,16,24,1,8"
|
||||
echo $CPU_BIND
|
||||
|
||||
cat << EOF > select_gpu
|
||||
#!/bin/bash
|
||||
export GPU_MAP=(0 1 2 3 4 5 6 7)
|
||||
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
|
||||
export HIP_VISIBLE_DEVICES=\$GPU
|
||||
unset ROCR_VISIBLE_DEVICES
|
||||
echo RANK \$SLURM_LOCALID using GPU \$GPU
|
||||
exec \$*
|
||||
EOF
|
||||
|
||||
chmod +x ./select_gpu
|
||||
|
||||
root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
|
||||
source ${root}/sourceme.sh
|
||||
|
||||
export OMP_NUM_THREADS=7
|
||||
export MPICH_GPU_SUPPORT_ENABLED=1
|
||||
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
|
||||
|
||||
for vol in 16.16.16.64 32.32.32.64 32.32.32.128
|
||||
do
|
||||
srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
|
||||
#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
|
||||
|
||||
srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
|
||||
#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
|
||||
done
|
||||
|
@ -1,30 +0,0 @@
|
||||
spack load c-lime
|
||||
spack load gmp
|
||||
spack load mpfr
|
||||
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
|
||||
GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
|
||||
MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
|
||||
echo clime X$CLIME
|
||||
echo gmp X$GMP
|
||||
echo mpfr X$MPFR
|
||||
|
||||
../../configure \
|
||||
--enable-comms=mpi-auto \
|
||||
--with-lime=$CLIME \
|
||||
--enable-unified=no \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=hip \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-simd=GPU \
|
||||
--enable-accelerator-cshift \
|
||||
--with-gmp=$GMP \
|
||||
--with-mpfr=$MPFR \
|
||||
--with-fftw=$FFTW_DIR/.. \
|
||||
--disable-fermion-reps \
|
||||
--disable-gparity \
|
||||
CXX=hipcc MPICXX=mpicxx \
|
||||
CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
|
||||
LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp"
|
||||
|
||||
|
||||
|
@ -1,5 +0,0 @@
|
||||
source ~/spack/share/spack/setup-env.sh
|
||||
module load CrayEnv LUMI/22.12 partition/G cray-fftw/3.3.10.1 rocm
|
||||
spack load c-lime
|
||||
spack load gmp
|
||||
spack load mpfr
|
@ -1,53 +0,0 @@
|
||||
1. Prerequisites:
|
||||
===================
|
||||
Make sure you have the latest Intel icpx compiler release loaded (via modules or similar)
|
||||
Make sure you have SYCL aware MPICH or Intel MPI loaded (assumed as mpicxx)
|
||||
|
||||
2. Obtain Grid:
|
||||
===================
|
||||
|
||||
bash$
|
||||
git clone https://github.com/paboyle/Grid
|
||||
cd Grid
|
||||
./bootstrap.sh
|
||||
cd systems/PVC
|
||||
|
||||
3. Build Grid:
|
||||
===================
|
||||
|
||||
Here, the configure command is stored in the file config-command:
|
||||
|
||||
bash$
|
||||
../../configure \
|
||||
--enable-simd=GPU \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-accelerator-cshift \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=sycl \
|
||||
--enable-unified=no \
|
||||
MPICXX=mpicxx \
|
||||
CXX=icpx \
|
||||
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
|
||||
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "
|
||||
|
||||
make all
|
||||
|
||||
4. Run a benchmark:
|
||||
===================
|
||||
|
||||
*** Assumes interactive access to node. ***
|
||||
|
||||
run Benchmark_dwf_fp32 using benchmarks/bench.sh
|
||||
|
||||
bash$
|
||||
cd benchmarks
|
||||
./bench.sh
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -1,18 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
export EnableImplicitScaling=0
|
||||
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
||||
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
|
||||
export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||
|
||||
mpiexec -launcher ssh -n 1 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 1tile.log
|
||||
mpiexec -launcher ssh -n 2 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 2tile.log
|
||||
|
||||
#mpiexec -launcher ssh -n 4 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 16.16.64.64 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 4tile.log
|
||||
#mpiexec -launcher ssh -n 8 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.4 --grid 16.16.64.128 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 8tile.log
|
||||
|
||||
|
@ -1,13 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
num_tile=2
|
||||
|
||||
gpu_id=$(( (MPI_LOCAL_RANKID % num_tile ) ))
|
||||
tile_id=$((MPI_LOCAL_RANKID / num_tile))
|
||||
|
||||
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
|
||||
|
||||
echo "local rank $MPI_LOCALRANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
|
||||
|
||||
"$@"
|
||||
|
@ -1,15 +0,0 @@
|
||||
../../configure \
|
||||
--enable-simd=GPU \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-accelerator-cshift \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=sycl \
|
||||
--enable-unified=no \
|
||||
MPICXX=mpicxx \
|
||||
CXX=icpx \
|
||||
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
|
||||
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "
|
||||
|
@ -1,3 +0,0 @@
|
||||
export https_proxy=http://proxy-chain.intel.com:911
|
||||
module load intel-release
|
||||
module load intel/mpich
|
@ -1,46 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
#PBS -l select=1:system=sunspot,place=scatter
|
||||
#PBS -A LatticeQCD_aesp_CNDA
|
||||
#PBS -l walltime=01:00:00
|
||||
#PBS -N dwf
|
||||
#PBS -k doe
|
||||
|
||||
HDIR=/home/paboyle/
|
||||
module use /soft/testing/modulefiles/
|
||||
module load intel-UMD23.05.25593.11/23.05.25593.11
|
||||
module load tools/pti-gpu
|
||||
export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
|
||||
export PATH=$HDIR/tools/bin:$PATH
|
||||
|
||||
export TZ='/usr/share/zoneinfo/US/Central'
|
||||
export OMP_PROC_BIND=spread
|
||||
export OMP_NUM_THREADS=3
|
||||
unset OMP_PLACES
|
||||
|
||||
cd $PBS_O_WORKDIR
|
||||
|
||||
qsub jobscript.pbs
|
||||
|
||||
echo Jobid: $PBS_JOBID
|
||||
echo Running on host `hostname`
|
||||
echo Running on nodes `cat $PBS_NODEFILE`
|
||||
|
||||
echo NODES
|
||||
cat $PBS_NODEFILE
|
||||
NNODES=`wc -l < $PBS_NODEFILE`
|
||||
NRANKS=12 # Number of MPI ranks per node
|
||||
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
|
||||
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
|
||||
|
||||
NTOTRANKS=$(( NNODES * NRANKS ))
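# Added note: with select=1 node above and NRANKS=12 this gives 12 ranks in total, matching the --mpi 1.1.2.6 decomposition (1*1*2*6=12) passed to Benchmark_dwf_fp32 below.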
|
||||
|
||||
echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
|
||||
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
|
||||
|
||||
|
||||
CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||
|
@ -1,52 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
display_help() {
|
||||
echo " Will map gpu tile to rank in compact and then round-robin fashion"
|
||||
echo " Usage (only work for one node of ATS/PVC):"
|
||||
echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
|
||||
echo
|
||||
echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
|
||||
echo " 0 Rank 0.0"
|
||||
echo " 1 Rank 0.1"
|
||||
echo " 2 Rank 1.0"
|
||||
echo " 3 Rank 1.1"
|
||||
echo " 4 Rank 2.0"
|
||||
echo " 5 Rank 2.1"
|
||||
echo " 6 Rank 0.0"
|
||||
echo
|
||||
echo " Hacked together by apl@anl.gov, please contact if bug found"
|
||||
exit 1
|
||||
}
|
||||
|
||||
#This gives the exact GPU count i915 knows about; udev is used to enumerate only the devices with physical presence.
|
||||
#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
|
||||
num_gpu=6
|
||||
num_tile=2
|
||||
|
||||
if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
|
||||
display_help
|
||||
fi
|
||||
|
||||
gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
|
||||
tile_id=$((PALS_LOCAL_RANKID % num_tile))
|
||||
|
||||
unset EnableWalkerPartition
|
||||
export EnableImplicitScaling=0
|
||||
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
||||
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
|
||||
export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||
|
||||
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
|
||||
|
||||
if [ $PALS_LOCAL_RANKID = 0 ]
|
||||
then
|
||||
onetrace --chrome-device-timeline "$@"
|
||||
# "$@"
|
||||
else
|
||||
"$@"
|
||||
fi
|
@ -1,16 +0,0 @@
|
||||
TOOLS=$HOME/tools
|
||||
../../configure \
|
||||
--enable-simd=GPU \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-accelerator-cshift \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=sycl \
|
||||
--enable-unified=no \
|
||||
MPICXX=mpicxx \
|
||||
CXX=icpx \
|
||||
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \
|
||||
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
|
||||
|
@ -1,4 +1,2 @@
|
||||
BREW=/opt/local/
|
||||
MPICXX=mpicxx CXX=c++-12 ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
|
||||
|
||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=yes
|
||||
|
||||
|
@ -115,7 +115,6 @@ int main(int argc, char ** argv)
|
||||
if (SE->_permute & 0x2 ) { permute(check[i],tmp,1); tmp=check[i];}
|
||||
if (SE->_permute & 0x4 ) { permute(check[i],tmp,2); tmp=check[i];}
|
||||
if (SE->_permute & 0x8 ) { permute(check[i],tmp,3); tmp=check[i];}
|
||||
// std::cout<<GridLogMessage<<"stencil["<<i<<"] "<< check[i]<< " perm "<<(uint32_t)SE->_permute <<std::endl;
|
||||
}
|
||||
|
||||
Real nrmC = norm2(Check);
|
||||
@ -139,17 +138,18 @@ int main(int argc, char ** argv)
|
||||
ddiff = check -bar;
|
||||
diff =norm2(ddiff);
|
||||
if ( diff > 0){
|
||||
std::cout <<"Diff at Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
|
||||
<<") stencil " <<check<<" vs cshift "<<bar<<std::endl;
|
||||
std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
|
||||
<<") " <<check<<" vs "<<bar<<std::endl;
|
||||
}
|
||||
|
||||
|
||||
}}}}
|
||||
|
||||
if (nrm > 1.0e-4) {
|
||||
autoView( check , Check, CpuRead);
|
||||
autoView( bar , Bar, CpuRead);
|
||||
for(int i=0;i<check.size();i++){
|
||||
std::cout << i<<" ERROR Check \n"<<check[i]<< "\n"<<i<<" Bar \n"<<bar[i]<<std::endl;
|
||||
std::cout << i<<" Check "<<check[i]<< "\n"<<i<<" Bar "<<bar[i]<<std::endl;
|
||||
}
|
||||
}
|
||||
if (nrm > 1.0e-4) exit(-1);
|
||||
|
@ -1,307 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/Test_cshift.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||
|
||||
Coordinate latt_size = GridDefaultLatt();
|
||||
Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1});
|
||||
Coordinate mpi_layout = GridDefaultMpi();
|
||||
|
||||
int vol = 1;
|
||||
for(int d=0;d<latt_size.size();d++){
|
||||
vol = vol * latt_size[d];
|
||||
}
|
||||
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
|
||||
GridRedBlackCartesian RBGRID(&GRID);
|
||||
|
||||
ComplexD ci(0.0,1.0);
|
||||
|
||||
std::vector<int> seeds({1,2,3,4});
|
||||
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); // naughty seeding
|
||||
GridParallelRNG pRNG(&GRID);
|
||||
pRNG.SeedFixedIntegers(seeds);
|
||||
|
||||
LatticeGaugeFieldD Umu(&GRID);
|
||||
|
||||
SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// PF prop
|
||||
////////////////////////////////////////////////////
|
||||
LatticeFermionD src(&GRID);
|
||||
|
||||
gaussian(pRNG,src);
|
||||
#if 1
|
||||
Coordinate point(4,0);
|
||||
src=Zero();
|
||||
SpinColourVectorD ferm; gaussian(sRNG,ferm);
|
||||
pokeSite(ferm,src,point);
|
||||
#endif
|
||||
|
||||
{
|
||||
std::cout<<"****************************************"<<std::endl;
|
||||
std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator \n";
|
||||
std::cout<<"****************************************"<<std::endl;
|
||||
|
||||
// LatticeFermionD src(&GRID); gaussian(pRNG,src);
|
||||
LatticeFermionD tmp(&GRID);
|
||||
LatticeFermionD ref(&GRID);
|
||||
LatticeFermionD diff(&GRID);
|
||||
|
||||
const int Ls=48+1;
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =0.8;
|
||||
OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
|
||||
|
||||
// Momentum space prop
|
||||
std::cout << " Solving by FFT and Feynman rules" <<std::endl;
|
||||
bool fiveD = false; //calculate 4d free propagator
|
||||
|
||||
std::cout << " Free propagator " <<std::endl;
|
||||
Dov.FreePropagator(src,ref,mass) ;
|
||||
std::cout << " Free propagator norm "<< norm2(ref) <<std::endl;
|
||||
|
||||
Gamma G5(Gamma::Algebra::Gamma5);
|
||||
|
||||
LatticeFermionD src5(FGrid); src5=Zero();
|
||||
LatticeFermionD tmp5(FGrid);
|
||||
LatticeFermionD result5(FGrid); result5=Zero();
|
||||
LatticeFermionD result4(&GRID);
|
||||
const int sdir=0;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Import
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
std::cout << " Free propagator Import "<< norm2(src) <<std::endl;
|
||||
Dov.ImportPhysicalFermionSource (src,src5);
|
||||
std::cout << " Free propagator Imported "<< norm2(src5) <<std::endl;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Conjugate gradient on normal equations system
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
|
||||
Dov.Mdag(src5,tmp5);
|
||||
src5=tmp5;
|
||||
MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
|
||||
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
|
||||
CG(HermOp,src5,result5);
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Domain wall physical field propagator
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
Dov.ExportPhysicalFermionSolution(result5,result4);
|
||||
|
||||
// From DWF4d.pdf :
|
||||
//
|
||||
// Dov_pf = 2/(1-m) D_cayley_ovlap [ Page 43 ]
|
||||
// Dinv_cayley_ovlap = 2/(1-m) Dinv_pf
|
||||
// Dinv_cayley_surface =1/(1-m) ( Dinv_cayley_ovlap - 1 ) => 2/(1-m)^2 Dinv_pf - 1/(1-m) * src [ Eq.2.67 ]
|
||||
|
||||
RealD scale = 2.0/(1.0-mass)/(1.0-mass);
|
||||
result4 = result4 * scale;
|
||||
result4 = result4 - src*(1.0/(1.0-mass)); // Subtract contact term
|
||||
DumpSliceNorm("Src",src);
|
||||
DumpSliceNorm("Grid",result4);
|
||||
DumpSliceNorm("Fourier",ref);
|
||||
|
||||
std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
|
||||
std::cout << "Dov ref "<<norm2(ref)<<std::endl;
|
||||
|
||||
diff = result4- ref;
|
||||
DumpSliceNorm("diff ",diff);
|
||||
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// Dwf prop
|
||||
////////////////////////////////////////////////////
|
||||
{
|
||||
std::cout<<"****************************************"<<std::endl;
|
||||
std::cout << "Testing Dov(Hw) Mom space 4d propagator \n";
|
||||
std::cout<<"****************************************"<<std::endl;
|
||||
|
||||
LatticeFermionD tmp(&GRID);
|
||||
LatticeFermionD ref(&GRID);
|
||||
LatticeFermionD diff(&GRID);
|
||||
|
||||
const int Ls=48;
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =0.8;
|
||||
|
||||
OverlapWilsonCayleyTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
|
||||
|
||||
// Momentum space prop
|
||||
std::cout << " Solving by FFT and Feynman rules" <<std::endl;
|
||||
Dov.FreePropagator(src,ref,mass) ;
|
||||
|
||||
Gamma G5(Gamma::Algebra::Gamma5);
|
||||
|
||||
LatticeFermionD src5(FGrid); src5=Zero();
|
||||
LatticeFermionD tmp5(FGrid);
|
||||
LatticeFermionD result5(FGrid); result5=Zero();
|
||||
LatticeFermionD result4(&GRID);
|
||||
const int sdir=0;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Domain wall physical field source; need D_minus
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
chi_5[0] = chiralProjectPlus(chi);
|
||||
chi_5[Ls-1]= chiralProjectMinus(chi);
|
||||
*/
|
||||
tmp = (src + G5*src)*0.5; InsertSlice(tmp,src5, 0,sdir);
|
||||
tmp = (src - G5*src)*0.5; InsertSlice(tmp,src5,Ls-1,sdir);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Conjugate gradient on normal equations system
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
|
||||
Dov.Dminus(src5,tmp5);
|
||||
src5=tmp5;
|
||||
Dov.Mdag(src5,tmp5);
|
||||
src5=tmp5;
|
||||
MdagMLinearOperator<OverlapWilsonCayleyTanhFermionD,LatticeFermionD> HermOp(Dov);
|
||||
ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
|
||||
CG(HermOp,src5,result5);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Domain wall physical field propagator
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
psi = chiralProjectMinus(psi_5[0]);
|
||||
psi += chiralProjectPlus(psi_5[Ls-1]);
|
||||
*/
|
||||
ExtractSlice(tmp,result5,0 ,sdir); result4 = (tmp-G5*tmp)*0.5;
|
||||
ExtractSlice(tmp,result5,Ls-1,sdir); result4 = result4+(tmp+G5*tmp)*0.5;
|
||||
|
||||
std::cout << " Taking difference" <<std::endl;
|
||||
std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
|
||||
std::cout << "Dov ref "<<norm2(ref)<<std::endl;
|
||||
DumpSliceNorm("Grid",result4);
|
||||
DumpSliceNorm("Fourier",ref);
|
||||
diff = ref - result4;
|
||||
std::cout << "result - ref "<<norm2(diff)<<std::endl;
|
||||
|
||||
DumpSliceNorm("diff",diff);
|
||||
|
||||
}
|
||||
|
||||
|
||||
{
std::cout<<"****************************************"<<std::endl;
std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator with q\n";
std::cout<<"****************************************"<<std::endl;

// LatticeFermionD src(&GRID); gaussian(pRNG,src);
LatticeFermionD tmp(&GRID);
LatticeFermionD ref(&GRID);
LatticeFermionD diff(&GRID);

const int Ls=48+1;
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);

RealD mass=0.1;
RealD M5 =0.8;
OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);

// Momentum space prop
std::cout << " Solving by FFT and Feynman rules" <<std::endl;
bool fiveD = false; //calculate 4d free propagator

std::cout << " Free propagator " <<std::endl;
Dov.FreePropagator(src,ref,mass) ;
std::cout << " Free propagator norm "<< norm2(ref) <<std::endl;

Gamma G5(Gamma::Algebra::Gamma5);

LatticeFermionD src5(FGrid); src5=Zero();
LatticeFermionD tmp5(FGrid);
LatticeFermionD result5(FGrid); result5=Zero();
LatticeFermionD result4(&GRID);
const int sdir=0;

////////////////////////////////////////////////////////////////////////
// Import
////////////////////////////////////////////////////////////////////////
std::cout << " Free propagator Import "<< norm2(src) <<std::endl;
Dov.ImportPhysicalFermionSource (src,src5);
std::cout << " Free propagator Imported "<< norm2(src5) <<std::endl;

////////////////////////////////////////////////////////////////////////
// Conjugate gradient on normal equations system
////////////////////////////////////////////////////////////////////////
std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
Dov.Mdag(src5,tmp5);
src5=tmp5;
MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
CG(HermOp,src5,result5);
////////////////////////////////////////////////////////////////////////
// Domain wall physical field propagator
////////////////////////////////////////////////////////////////////////
Dov.ExportPhysicalFermionSolution(result5,result4);

// From DWF4d.pdf :
//
// Dov_pf = 2/(1-m) D_cayley_ovlap [ Page 43 ]
// Dinv_cayley_ovlap = 2/(1-m) Dinv_pf
// Dinv_cayley_surface =1/(1-m) ( Dinv_cayley_ovlap - 1 ) => 2/(1-m)^2 Dinv_pf - 1/(1-m) * src [ Eq.2.67 ]

RealD scale = 2.0/(1.0-mass)/(1.0-mass);
result4 = result4 * scale;
result4 = result4 - src*(1.0/(1.0-mass)); // Subtract contact term
DumpSliceNorm("Src",src);
DumpSliceNorm("Grid",result4);
DumpSliceNorm("Fourier",ref);

std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
std::cout << "Dov ref "<<norm2(ref)<<std::endl;

diff = result4- ref;
DumpSliceNorm("diff ",diff);

}


Grid_finalize();
}
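A note on the normalisation used in the comparison above, restating the DWF4d.pdf relation quoted in the comments (m is the quark mass): since the partial-fraction operator is taken to satisfy D_pf = (2/(1-m)) D_ov, the surface-field propagator is

\[
D^{-1}_{\rm surface} \;=\; \frac{1}{1-m}\Big(D^{-1}_{\rm ov} - 1\Big)
\;=\; \frac{2}{(1-m)^2}\,D^{-1}_{\rm pf} \;-\; \frac{1}{1-m},
\]

which is why the CG solution is multiplied by scale = 2/(1-m)^2 and the contact term src/(1-m) is subtracted before comparing with the Feynman-rule reference.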
@ -63,9 +63,7 @@ int main(int argc, char** argv) {
std::cout << "Dimension of adjoint representation: "<< SU2Adjoint::Dimension << std::endl;

// guard as this code fails to compile for Nc != 3
#if 1

std::cout << " Printing Adjoint Generators"<< std::endl;
#if (Nc == 3)

SU2Adjoint::printGenerators();
SU2::testGenerators();
@ -150,33 +148,10 @@ int main(int argc, char** argv) {
typename AdjointRep<Nc>::LatticeMatrix Vrmu = peekLorentz(Vr,mu);
pokeLorentz(UrVr,Urmu*Vrmu, mu);
}

typedef typename SU_Adjoint<Nc>::AMatrix AdjointMatrix;

typename AdjointRep<Nc>::LatticeField Diff_check = UVr - UrVr;
std::cout << GridLogMessage << "Group structure SU("<<Nc<<") check difference (Adjoint representation) : " << norm2(Diff_check) << std::endl;

std::cout << GridLogMessage << "****************************************** " << std::endl;
std::cout << GridLogMessage << " MAP BETWEEN FUNDAMENTAL AND ADJOINT CHECK " << std::endl;
std::cout << GridLogMessage << "****************************************** " << std::endl;
for(int a=0;a<Nc*Nc-1;a++){
for(int b=0;b<Nc*Nc-1;b++){
for(int c=0;c<Nc*Nc-1;c++){
ColourMatrix Ta;
ColourMatrix Tb;
ColourMatrix Tc;
SU3::generator(a, Ta);
SU3::generator(b, Tb);
SU3::generator(c, Tc);
AdjointMatrix TRa;
SU3Adjoint::generator(a,TRa);
Complex tr1 = trace ( Tc * ( Ta*Tb-Tb*Ta)); // i/2 fabc
Complex tr2 = TRa()()(b,c) * Complex(0,1);
std::cout << " 2 Tr( Tc[Ta,Tb]) " << 2.0*tr1<<std::endl;
std::cout << " - TRa_bc " << tr2<<std::endl;
assert(abs( (2.0*tr1-tr2) ) < 1.0e-7);
std::cout << "------------------"<<std::endl;
}}}


// Check correspondence of algebra and group transformations
// Create a random vector
SU3::LatticeAlgebraVector h_adj(grid);
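For reference, the fundamental/adjoint check in the loop above is the usual structure-constant relation: assuming the generators are normalised so that tr(T_a T_b) = δ_ab/2 and [T_a,T_b] = i f_abc T_c,

\[
2\,\mathrm{tr}\big(T_c\,[T_a,T_b]\big) \;=\; i\,f_{abc},
\]

so the assert identifies i (T^R_a)_{bc}, as returned by SU3Adjoint::generator, with this quantity to a tolerance of 10^{-7}; the normalisation convention is inferred only from the explicit factor of 2 and the Complex(0,1) written in the code.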
@ -1,188 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./tests/Test_iwasaki_action_newstaple.cc

Copyright (C) 2015

Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;

////////////////////////////////////////////////////////////////////////
// PlaqPlusRectangleAction
////////////////////////////////////////////////////////////////////////
template<class Gimpl>
class PlaqPlusRectangleActionOrig : public Action<typename Gimpl::GaugeField> {
public:

INHERIT_GIMPL_TYPES(Gimpl);

private:
RealD c_plaq;
RealD c_rect;

public:
PlaqPlusRectangleActionOrig(RealD b,RealD c): c_plaq(b),c_rect(c){};

virtual std::string action_name(){return "PlaqPlusRectangleActionOrig";}

virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms

virtual std::string LogParameters(){
std::stringstream sstream;
sstream << GridLogMessage << "["<<action_name() <<"] c_plaq: " << c_plaq << std::endl;
sstream << GridLogMessage << "["<<action_name() <<"] c_rect: " << c_rect << std::endl;
return sstream.str();
}


virtual RealD S(const GaugeField &U) {
RealD vol = U.Grid()->gSites();

RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
RealD rect = WilsonLoops<Gimpl>::avgRectangle(U);

RealD action=c_plaq*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5
+c_rect*(1.0 -rect)*(Nd*(Nd-1.0))*vol;

return action;
};

virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
//extend Ta to include Lorentz indexes
RealD factor_p = c_plaq/RealD(Nc)*0.5;
RealD factor_r = c_rect/RealD(Nc)*0.5;

GridBase *grid = Umu.Grid();

std::vector<GaugeLinkField> U (Nd,grid);
std::vector<GaugeLinkField> U2(Nd,grid);

for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
}

GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);

for (int mu=0; mu < Nd; mu++){

// Staple in direction mu

WilsonLoops<Gimpl>::Staple(staple,Umu,mu);

dSdU_mu = Ta(U[mu]*staple)*factor_p;

WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);

dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;

PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

};

};

// Convenience for common physically defined cases.
//
// RBC c1 parameterisation is not really RBC but don't have good
// reference and we are happy to change name if prior use of this plaq coeff
// parameterisation is made known to us.
template<class Gimpl>
class RBCGaugeActionOrig : public PlaqPlusRectangleActionOrig<Gimpl> {
public:
INHERIT_GIMPL_TYPES(Gimpl);
RBCGaugeActionOrig(RealD beta,RealD c1) : PlaqPlusRectangleActionOrig<Gimpl>(beta*(1.0-8.0*c1), beta*c1) {};
virtual std::string action_name(){return "RBCGaugeActionOrig";}
};

template<class Gimpl>
class IwasakiGaugeActionOrig : public RBCGaugeActionOrig<Gimpl> {
public:
INHERIT_GIMPL_TYPES(Gimpl);
IwasakiGaugeActionOrig(RealD beta) : RBCGaugeActionOrig<Gimpl>(beta,-0.331) {};
virtual std::string action_name(){return "IwasakiGaugeActionOrig";}
};


int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);

Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << " mpi "<<mpi_layout<<std::endl;
std::cout << " simd "<<simd_layout<<std::endl;
std::cout << " latt "<<latt_size<<std::endl;
GridCartesian GRID(latt_size,simd_layout,mpi_layout);

GridParallelRNG pRNG(&GRID);
pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeGaugeField U(&GRID);

SU<Nc>::HotConfiguration(pRNG,U);

//#define PRD
#ifdef PRD
typedef PeriodicGimplD Gimpl;
#else
typedef ConjugateGimplD Gimpl;
std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
Gimpl::setDirections(conj_dirs);
#endif

typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;

GaugeLorentz derivOrig(&GRID), derivNew(&GRID);
double beta = 2.13;
IwasakiGaugeActionOrig<Gimpl> action_orig(beta);
IwasakiGaugeAction<Gimpl> action_new(beta);

double torig=0, tnew=0;
int ntest = 10;
for(int i=0;i<ntest;i++){
double t0 = usecond();
action_orig.deriv(U, derivOrig);
double t1 = usecond();
action_new.deriv(U, derivNew);
double t2 = usecond();

GaugeLorentz diff = derivOrig - derivNew;
double n = norm2(diff);
std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
assert(n<1e-10);

std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms, new: " << (t2-t1)/1000 << "ms" << std::endl;
torig += (t1-t0)/1000; tnew += (t2-t1)/1000;
}
std::cout << GridLogMessage << "Avg timings " << ntest << " iterations: orig:" << torig/ntest << "ms, new:" << tnew/ntest << "ms" << std::endl;

Grid_finalize();
}
@ -1,94 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./tests/Test_optimized_staple_gaugebc.cc

Copyright (C) 2015

Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>

using namespace std;
using namespace Grid;

int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);

Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << " mpi "<<mpi_layout<<std::endl;
std::cout << " simd "<<simd_layout<<std::endl;
std::cout << " latt "<<latt_size<<std::endl;
GridCartesian GRID(latt_size,simd_layout,mpi_layout);

GridParallelRNG pRNG(&GRID);
pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeGaugeField U(&GRID);

SU<Nc>::HotConfiguration(pRNG,U);

//#define PRD
#ifdef PRD
typedef PeriodicGimplD Gimpl;
#else
typedef ConjugateGimplD Gimpl;
std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
Gimpl::setDirections(conj_dirs);
#endif

typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;

int count = 0;
double torig=0, topt=0;

std::vector<GaugeMat> Umu(Nd,&GRID), U2(Nd,&GRID);
for(int mu=0;mu<Nd;mu++){
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
WilsonLoops<Gimpl>::RectStapleDouble(U2[mu], Umu[mu], mu);
}

std::cout << GridLogMessage << "Checking optimized vs unoptimized RectStaple" << std::endl;
for(int mu=0;mu<Nd;mu++){
GaugeMat staple_orig(&GRID), staple_opt(&GRID), staple_U2(&GRID);
double t0 = usecond();
WilsonLoops<Gimpl>::RectStapleUnoptimised(staple_orig,U,mu);
double t1 = usecond();
WilsonLoops<Gimpl>::RectStapleOptimised(staple_opt, U2, Umu, mu);
double t2 = usecond();
torig += t1-t0; topt += t2-t1;
++count;

GaugeMat diff = staple_orig - staple_opt;
double n = norm2(diff);
std::cout << GridLogMessage << mu << " " << n << std::endl;
assert(n<1e-10);
}
std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms, optimized: " << topt/1000/count << "ms" << std::endl;

Grid_finalize();
}
@ -1,184 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./tests/Test_padded_cell.cc

Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>

using namespace std;
using namespace Grid;

template<class vobj> void gpermute(vobj & inout,int perm){
vobj tmp=inout;
if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
if (perm & 0x2 ) { permute(inout,tmp,1); tmp=inout;}
if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
}

int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);

Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << " mpi "<<mpi_layout<<std::endl;
std::cout << " simd "<<simd_layout<<std::endl;
std::cout << " latt "<<latt_size<<std::endl;
GridCartesian GRID(latt_size,simd_layout,mpi_layout);

GridParallelRNG pRNG(&GRID);
pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeGaugeField Umu(&GRID);

SU<Nc>::HotConfiguration(pRNG,Umu);

Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
LatticeComplex trplaq(&GRID);

std::vector<LatticeColourMatrix> U(Nd, Umu.Grid());
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}

std::cout << GridLogMessage << " Average plaquette "<<plaq<<std::endl;

LatticeComplex cplaq(&GRID); cplaq=Zero();

/////////////////////////////////////////////////
// Create a padded cell of extra padding depth=1
/////////////////////////////////////////////////
int depth = 1;
PaddedCell Ghost(depth,&GRID);
LatticeGaugeField Ughost = Ghost.Exchange(Umu);

///////////////////////////////////////////////////////////////////
// Temporary debug Hack for single rank sim:
// Check the contents of the cell are periodically replicated
// In future ONLY pad those dimensions that are not local to node
///////////////////////////////////////////////////////////////////
#if 0
{
double diff=0;
double n=0;
{
autoView( Ug_v , Ughost, CpuRead);
autoView( Ul_v , Umu , CpuRead);
for(int x=0;x<latt_size[0]+2;x++){
for(int y=0;y<latt_size[1]+2;y++){
for(int z=0;z<latt_size[2]+2;z++){
for(int t=0;t<latt_size[3]+2;t++){
int lx=(x-1+latt_size[0])%latt_size[0];
int ly=(y-1+latt_size[1])%latt_size[1];
int lz=(z-1+latt_size[2])%latt_size[2];
int lt=(t-1+latt_size[3])%latt_size[3];
Coordinate gcoor({x,y,z,t});
Coordinate lcoor({lx,ly,lz,lt});
LorentzColourMatrix g;
LorentzColourMatrix l;
peekLocalSite(g,Ug_v,gcoor);
peekLocalSite(l,Ul_v,lcoor);
g=g-l;
assert(norm2(g)==0);
diff = diff + norm2(g);
n = n + norm2(l);
}}}}
}
std::cout << "padded field check diff "<< diff <<" / "<< n<<std::endl;
std::cout << norm2(Ughost)<< " " << norm2(Umu)<<std::endl;
}
#endif

///// Array for the site plaquette
GridBase *GhostGrid = Ughost.Grid();
LatticeComplex gplaq(GhostGrid);

std::vector<Coordinate> shifts;
for(int mu=0;mu<Nd;mu++){
for(int nu=mu+1;nu<Nd;nu++){

// Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
Coordinate shift_0(Nd,0);
Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
shifts.push_back(shift_0);
shifts.push_back(shift_mu);
shifts.push_back(shift_nu);
shifts.push_back(shift_0);
}
}
GeneralLocalStencil gStencil(GhostGrid,shifts);

gplaq=Zero();
{
autoView( gp_v , gplaq, CpuWrite);
autoView( t_v , trplaq, CpuRead);
autoView( U_v , Ughost, CpuRead);
for(int ss=0;ss<gp_v.size();ss++){
int s=0;
for(int mu=0;mu<Nd;mu++){
for(int nu=mu+1;nu<Nd;nu++){

auto SE0 = gStencil.GetEntry(s+0,ss);
auto SE1 = gStencil.GetEntry(s+1,ss);
auto SE2 = gStencil.GetEntry(s+2,ss);
auto SE3 = gStencil.GetEntry(s+3,ss);

int o0 = SE0->_offset;
int o1 = SE1->_offset;
int o2 = SE2->_offset;
int o3 = SE3->_offset;

auto U0 = U_v[o0](mu);
auto U1 = U_v[o1](nu);
auto U2 = adj(U_v[o2](mu));
auto U3 = adj(U_v[o3](nu));

gpermute(U0,SE0->_permute);
gpermute(U1,SE1->_permute);
gpermute(U2,SE2->_permute);
gpermute(U3,SE3->_permute);

gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 );
s=s+4;
}
}
}
}
cplaq = Ghost.Extract(gplaq);
RealD vol = cplaq.Grid()->gSites();
RealD faces = (Nd * (Nd-1))/2;
auto p = TensorRemove(sum(cplaq));
auto result = p.real()/vol/faces/Nc;

std::cout << GridLogMessage << " Average plaquette via padded cell "<<result<<std::endl;
std::cout << GridLogMessage << " Diff "<<result-plaq<<std::endl;

assert(fabs(result-plaq)<1.0e-8);
Grid_finalize();
}
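The padded-cell measurement above is normalised in the same way as the reference avgPlaquette: the traced loops are summed over all sites and over the Nd(Nd-1)/2 planes, then divided by the volume, the number of planes and Nc,

\[
\langle P \rangle \;=\; \frac{2}{V\,N_d(N_d-1)\,N_c}\,
\mathrm{Re}\sum_{x}\sum_{\mu<\nu}
\mathrm{tr}\big[U_\mu(x)\,U_\nu(x+\hat\mu)\,U_\mu^\dagger(x+\hat\nu)\,U_\nu^\dagger(x)\big],
\]

which is exactly what the final lines compute from cplaq before asserting agreement with the standard WilsonLoops result to 10^{-8}.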
@ -1,580 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./tests/Test_padded_cell_staple.cc

Copyright (C) 2015

Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>

using namespace std;
using namespace Grid;

template <class Gimpl> class WilsonLoopsTest : public Gimpl {
public:
INHERIT_GIMPL_TYPES(Gimpl);

typedef typename Gimpl::GaugeLinkField GaugeMat;
typedef typename Gimpl::GaugeField GaugeLorentz;


//Original implementation
static void StapleOrig(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
int nu) {

GridBase *grid = Umu.Grid();

std::vector<GaugeMat> U(Nd, grid);
for (int d = 0; d < Nd; d++) {
U[d] = PeekIndex<LorentzIndex>(Umu, d);
}
staple = Zero();

if (nu != mu) {

// mu
// ^
// |__> nu

// __
// |
// __|
//

//Forward: Out(x) = Link(x)*field(x+mu)
//Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
//ShiftStaple: Link(x) = Link(x+mu)

//tmp1 = U^dag_nu(x-nu)
//tmp2 = U^dag_mu(x-mu) tmp1(x-mu) = U^dag_mu(x-mu) U^dag_nu(x-nu-mu)
//tmp3 = U_nu(x) tmp2(x+nu) = U_nu(x)U^dag_mu(x-mu+nu) U^dag_nu(x-mu)
//tmp4 = tmp(x+mu) = U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)

staple += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
mu);

// __
// |
// |__
//
//

//tmp1 = U_mu^dag(x-mu) U_nu(x-mu)
//tmp2 = U_nu^dag(x-nu) tmp1(x-nu) = U_nu^dag(x-nu) U_mu^dag(x-mu-nu) U_nu(x-mu-nu)
//tmp3 = tmp2(x+mu) = U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu], nu,
Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
mu);
}
}

static void StaplePadded(GaugeMat &staple, const GaugeLorentz &U, int mu,
int nu) {
if(nu==mu){
staple = Zero();
return;
}
double peek = 0, construct = 0, exchange = 0, coord = 0, stencil =0, kernel = 0, extract = 0, total = 0;

double tstart = usecond();
double t=tstart;

PaddedCell Ghost(1, (GridCartesian*)U.Grid());

construct += usecond() - t;

t=usecond();
GaugeMat U_mu = PeekIndex<LorentzIndex>(U, mu);
GaugeMat U_nu = PeekIndex<LorentzIndex>(U, nu);
peek += usecond() - t;

t=usecond();
CshiftImplGauge<Gimpl> cshift_impl;
GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl);
GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl);
exchange += usecond() - t;

GridBase *ggrid = Ug_mu.Grid();

GaugeMat gStaple(ggrid);

t=usecond();
Coordinate shift_0(Nd,0);
Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;

std::vector<Coordinate> shifts;

//U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
shifts.push_back(shift_0);
shifts.push_back(shift_nu);
shifts.push_back(shift_mu);

//U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
shifts.push_back(shift_mnu);
shifts.push_back(shift_mnu);
shifts.push_back(shift_mnu_pmu);
coord += usecond()-t;

t=usecond();
GeneralLocalStencil gStencil(ggrid,shifts);
stencil += usecond() -t;

t=usecond();
{
autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View();
autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
autoView( Ug_nu_v , Ug_nu, AcceleratorRead);

accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss);
auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(1,ss);
auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(2,ss);
auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);

auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x;

e = gStencil_v.GetEntry(3,ss);
auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(4,ss);
auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(5,ss);
auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));

stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu;

coalescedWrite(gStaple_v[ss],stencil_ss);
}
);
} //ensure views are all closed!
kernel += usecond() - t;

t=usecond();
staple = Ghost.Extract(gStaple);
extract += usecond()-t;

total += usecond() - tstart;
std::cout << GridLogMessage << "StaplePadded timings peek:" << peek << " construct:" << construct << " exchange:" << exchange << " coord:" << coord << " stencil:" << stencil << " kernel:" << kernel << " extract:" << extract << " total:" << total << std::endl;
}

static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu,
int mu) {
GridBase *grid = Umu.Grid();

std::vector<GaugeMat> U(Nd, grid);
for (int d = 0; d < Nd; d++) {
U[d] = PeekIndex<LorentzIndex>(Umu, d);
}

Stap = Zero();

for (int nu = 0; nu < Nd; nu++) {
if (nu != mu) {
// __ ___
// | __ |
//
//tmp1 = U_nu^dag(x-nu)
//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu) U_nu^dag(x-nu-mu)
//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu) U_mu^dag(x-2mu) U_nu^dag(x-nu-2mu)
//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu) U_mu^dag(x-2mu+nu) U_nu^dag(x-2mu)
//tmp5 = U_mu(x)tmp4(x+mu) = U_mu(x)U_nu(x+mu)U_mu^dag(x+nu) U_mu^dag(x-mu+nu) U_nu^dag(x-mu)
//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)

Stap += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[mu], mu,
Gimpl::CovShiftForward(
U[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu,
Gimpl::CovShiftBackward(
U[mu], mu,
Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
mu);

// __
// |__ __ |

//tmp1 = U^dag_mu(x-mu)U_nu(x-mu)
//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)
//tmp3 = U^dag_nu(x-nu)tmp2(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)
//tmp4 = U_mu(x)tmp3(x+mu) = U_mu(x)U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)
//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)

Stap += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[mu], mu,
Gimpl::CovShiftBackward(
U[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
mu);

// __
// |__ __ |
//Forward: Out(x) = Link(x)*field(x+mu)
//Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
//ShiftStaple: Link(x) = Link(x+mu)

//tmp1 = U_nu(x)U_mu(x+nu)
//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U_nu(x-mu)U_mu(x+nu-mu)
//tmp3 = U^dag_mu(x-mu)tmp2(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)U_mu(x+nu-2mu)
//tmp4 = U^dag_nu(x-nu)tmp3(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)U_mu(x-2mu)
//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
Stap += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(
U[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
mu);

// __ ___
// |__ |
//tmp1 = U_nu^dag(x-nu)U_mu(x-nu)
//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_mu(x-mu-nu)
//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_mu^dag(x-2mu)U_nu^dag(x-2mu-nu)U_mu(x-2mu-nu)
//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_mu^dag(x-2mu+nu)U_nu^dag(x-2mu)U_mu(x-2mu)
//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
Stap += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
mu);

// --
// | |
//
// | |
//tmp1 = U_nu^dag(x-nu)
//tmp2 = U_nu^dag(x-nu)tmp1(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)
//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_nu^dag(x-mu-2nu)
//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_nu^dag(x-mu-nu)
//tmp5 = U_nu(x)tmp4(x+nu) = U_nu(x)U_nu(x+nu)U_mu^dag(x-mu+2nu)U_nu^dag(x-mu+nu)U_nu^dag(x-mu)
//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
Stap += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Gimpl::CovShiftForward(
U[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu,
Gimpl::CovShiftBackward(
U[nu], nu,
Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
mu);

// | |
//
// | |
// --
//tmp1 = U_nu(x)U_nu(x+nu)
//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu(x-mu)U_nu(x-mu+nu)
//tmp3 = U_nu^dag(x-nu)tmp2(x-nu) = U_nu^dag(x-nu)U_mu^dag(x-mu-nu)U_nu(x-mu-nu)U_nu(x-mu)
//tmp4 = U_nu^dag(x-nu)tmp3(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)U_mu^dag(x-mu-2nu)U_nu(x-mu-2nu)U_nu(x-mu-nu)
//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
Stap += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(
U[nu], nu,
Gimpl::CovShiftBackward(
U[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
mu);
}
}
}

static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U,
int mu) {
PaddedCell Ghost(2,(GridCartesian*)U.Grid());
GridBase *ggrid = Ghost.grids.back();

CshiftImplGauge<Gimpl> cshift_impl;
std::vector<GaugeMat> Ug_dirs(Nd,ggrid);
for(int i=0;i<Nd;i++) Ug_dirs[i] = Ghost.Exchange(PeekIndex<LorentzIndex>(U, i), cshift_impl);

GaugeMat gStaple(ggrid);

std::vector<Coordinate> shifts;
for (int nu = 0; nu < Nd; nu++) {
if (nu != mu) {
auto genShift = [&](int mushift,int nushift){
Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
};

//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
shifts.push_back(genShift(0,0));
shifts.push_back(genShift(0,+1));
shifts.push_back(genShift(+1,+1));
shifts.push_back(genShift(+2,0));
shifts.push_back(genShift(+1,0));

//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
shifts.push_back(genShift(0,-1));
shifts.push_back(genShift(0,-1));
shifts.push_back(genShift(+1,-1));
shifts.push_back(genShift(+2,-1));
shifts.push_back(genShift(+1,0));

//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
shifts.push_back(genShift(-1,0));
shifts.push_back(genShift(-1,-1));
shifts.push_back(genShift(-1,-1));
shifts.push_back(genShift(0,-1));
shifts.push_back(genShift(+1,-1));

//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
shifts.push_back(genShift(-1,0));
shifts.push_back(genShift(-1,0));
shifts.push_back(genShift(-1,+1));
shifts.push_back(genShift(0,+1));
shifts.push_back(genShift(+1,0));

//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
shifts.push_back(genShift(0,0));
shifts.push_back(genShift(0,+1));
shifts.push_back(genShift(0,+2));
shifts.push_back(genShift(+1,+1));
shifts.push_back(genShift(+1,0));

//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
shifts.push_back(genShift(0,-1));
shifts.push_back(genShift(0,-2));
shifts.push_back(genShift(0,-2));
shifts.push_back(genShift(+1,-2));
shifts.push_back(genShift(+1,-1));
}
}
size_t nshift = shifts.size();

GeneralLocalStencil gStencil(ggrid,shifts);
{
autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View();

typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
size_t vsize = Nd*sizeof(GaugeViewType);
GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = Ug_dirs[i].View(AcceleratorRead);
GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);

accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
stencil_ss = Zero();
int s=0;
for(int nu=0;nu<Nd;nu++){
if(nu != mu){
//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(s++,ss);
auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);

stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
e = gStencil_v.GetEntry(s++,ss);
U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(s++,ss);
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);

stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
e = gStencil_v.GetEntry(s++,ss);
U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(s++,ss);
U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(s++,ss);
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));

stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
e = gStencil_v.GetEntry(s++,ss);
U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(s++,ss);
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);

stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
e = gStencil_v.GetEntry(s++,ss);
U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(s++,ss);
U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);

stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
e = gStencil_v.GetEntry(s++,ss);
U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(s++,ss);
U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
e = gStencil_v.GetEntry(s++,ss);
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
e = gStencil_v.GetEntry(s++,ss);
U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));

stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

}
}
assert(s==nshift);
coalescedWrite(gStaple_v[ss],stencil_ss);
}
);

for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
free(Ug_dirs_v_host);
acceleratorFreeDevice(Ug_dirs_v);
}
Stap = Ghost.Extract(gStaple);
}


};

int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);

Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << " mpi "<<mpi_layout<<std::endl;
std::cout << " simd "<<simd_layout<<std::endl;
std::cout << " latt "<<latt_size<<std::endl;
GridCartesian GRID(latt_size,simd_layout,mpi_layout);

GridParallelRNG pRNG(&GRID);
pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeGaugeField U(&GRID);

SU<Nc>::HotConfiguration(pRNG,U);

//typedef PeriodicGimplD Gimpl;
typedef ConjugateGimplD Gimpl;
std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
Gimpl::setDirections(conj_dirs);

typedef typename WilsonLoopsTest<Gimpl>::GaugeMat GaugeMat;
typedef typename WilsonLoopsTest<Gimpl>::GaugeLorentz GaugeLorentz;

std::cout << GridLogMessage << "Checking Staple" << std::endl;
int count = 0;
double torig=0, tpadded=0;

for(int mu=0;mu<Nd;mu++){
for(int nu=0;nu<Nd;nu++){
if(mu != nu){
GaugeMat staple_orig(&GRID), staple_padded(&GRID);
double t0 = usecond();
WilsonLoopsTest<Gimpl>::StapleOrig(staple_orig,U,mu,nu);
double t1 = usecond();
WilsonLoopsTest<Gimpl>::StaplePadded(staple_padded,U,mu,nu);
double t2 = usecond();
torig += t1-t0; tpadded += t2-t1;
++count;

GaugeMat diff = staple_orig - staple_padded;
double n = norm2(diff);
std::cout << GridLogMessage << mu << " " << nu << " " << n << std::endl;
assert(n<1e-10);
}
}
}
std::cout << GridLogMessage << "Staple timings orig: " << torig/1000/count << "ms, padded: " << tpadded/1000/count << "ms" << std::endl;
count=0; torig=tpadded=0;

std::cout << GridLogMessage << "Checking RectStaple" << std::endl;
for(int mu=0;mu<Nd;mu++){
GaugeMat staple_orig(&GRID), staple_padded(&GRID);
double t0 = usecond();
WilsonLoopsTest<Gimpl>::RectStapleOrig(staple_orig,U,mu);
double t1 = usecond();
WilsonLoopsTest<Gimpl>::RectStaplePadded(staple_padded,U,mu);
double t2 = usecond();
torig += t1-t0; tpadded += t2-t1;
++count;

GaugeMat diff = staple_orig - staple_padded;
double n = norm2(diff);
std::cout << GridLogMessage << mu << " " << n << std::endl;
assert(n<1e-10);
}
std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms, padded: " << tpadded/1000/count << "ms" << std::endl;

Grid_finalize();
}
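Both staple implementations in the test above build the same object; in the notation of the comments, the nearest-neighbour staple accumulated at x (after the final shift by mu) is

\[
S_{\mu\nu}(x) \;=\; U_\nu(x+\hat\mu)\,U_\mu^\dagger(x+\hat\nu)\,U_\nu^\dagger(x)
\;+\; U_\nu^\dagger(x+\hat\mu-\hat\nu)\,U_\mu^\dagger(x-\hat\nu)\,U_\nu(x-\hat\nu).
\]

StapleOrig assembles this from chained covariant shifts, while StaplePadded gathers the six required links through a depth-1 PaddedCell exchange and a GeneralLocalStencil and multiplies them in a single accelerator kernel; the rectangle staples are treated analogously with a depth-2 halo.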
@ -1,219 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./tests/Test_fthmc.cc

Copyright (C) 2022

Author: Peter Boyle <pboyle@bnl.gov>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>

using namespace std;
using namespace Grid;

typedef MobiusFermionD FermionAction;
typedef WilsonImplD FimplD;
typedef WilsonImplD FermionImplPolicy;

template<class Gimpl>
void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeField> & smU,MomentumFilterBase<LatticeGaugeField> &Filter)
{
LatticeGaugeField U = smU.get_U(false); // unsmeared config
GridBase *UGrid = U.Grid();

std::vector<int> seeds({1,2,3,5});
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds);

LatticeColourMatrix Pmu(UGrid);
LatticeGaugeField P(UGrid);
LatticeGaugeField UdSdU(UGrid);

std::cout << GridLogMessage << "*********************************************************"<<std::endl;
std::cout << GridLogMessage << " Force test for "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "*********************************************************"<<std::endl;

RealD eps=0.01;

std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Refresh "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;

Gimpl::generate_momenta(P,sRNG,RNG4);
// Filter.applyFilter(P);

action.refresh(smU,sRNG,RNG4);

std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;

RealD S1 = action.S(smU);

Gimpl::update_field(P,U,eps);
smU.set_Field(U);

std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Derivative "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
action.deriv(smU,UdSdU);
UdSdU = Ta(UdSdU);
// Filter.applyFilter(UdSdU);

DumpSliceNorm("Force",UdSdU,Nd-1);

Gimpl::update_field(P,U,eps);
smU.set_Field(U);

std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;

RealD S2 = action.S(smU);

// Use the derivative
LatticeComplex dS(UGrid); dS = Zero();
for(int mu=0;mu<Nd;mu++){
auto UdSdUmu = PeekIndex<LorentzIndex>(UdSdU,mu);
Pmu= PeekIndex<LorentzIndex>(P,mu);
dS = dS - trace(Pmu*UdSdUmu)*eps*2.0*HMC_MOMENTUM_DENOMINATOR;
}
ComplexD dSpred = sum(dS);
RealD diff = S2-S1-dSpred.real();

std::cout<< GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout<< GridLogMessage << "S1 : "<< S1 <<std::endl;
std::cout<< GridLogMessage << "S2 : "<< S2 <<std::endl;
std::cout<< GridLogMessage << "dS : "<< S2-S1 <<std::endl;
std::cout<< GridLogMessage << "dSpred : "<< dSpred.real() <<std::endl;
std::cout<< GridLogMessage << "diff : "<< diff<<std::endl;
std::cout<< GridLogMessage << "*********************************************************"<<std::endl;
// assert(diff<1.0);
std::cout<< GridLogMessage << "Done" <<std::endl;
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
}

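ForceTest above compares the measured change of the action with the prediction from its derivative: after the two update_field(P,U,eps) steps, the change S2-S1 is checked against

\[
\Delta S_{\rm pred} \;=\; -\,2\,\epsilon\,\cdot{\rm HMC\_MOMENTUM\_DENOMINATOR}
\sum_{\mu}\sum_{x}\mathrm{tr}\big[P_\mu(x)\,\big(U\,\partial S/\partial U\big)_\mu(x)\big],
\]

which is what the dS accumulation implements; the printout reports S1, S2, dS, dSpred and their difference, and the assert on the difference is currently commented out.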
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);

std::cout << std::setprecision(14);
Coordinate latt_size = GridDefaultLatt();
Coordinate mpi_layout = GridDefaultMpi();
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi_layout,shm);

const int Ls=12;
const int Nt = latt_size[3];
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);


///////////////////// Gauge Field and Gauge Forces ////////////////////////////
LatticeGaugeField U(UGrid);

#if 0
FieldMetaData header;
std::string file("./ckpoint_lat.2000");
NerscIO::readConfiguration(U,header,file);
#else
std::vector<int> seeds({1,2,3,4,5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds);
SU<Nc>::HotConfiguration(RNG4,U);
#endif


WilsonGaugeActionR PlaqAction(6.0);
IwasakiGaugeActionR RectAction(2.13);
PlaqAction.is_smeared = true;
RectAction.is_smeared = true;

////////////////////////////////////
// Fermion Action
////////////////////////////////////
RealD mass=0.01;
RealD pvmass=1.0;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;

// Double versions
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
FermionAction DdwfPeriodic(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,Params);
FermionAction PVPeriodic (U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pvmass,M5,b,c,Params);

double StoppingCondition = 1.0e-8;
double MaxCGIterations = 50000;
ConjugateGradient<LatticeFermion> CG(StoppingCondition,MaxCGIterations);

TwoFlavourRatioPseudoFermionAction<FimplD> Nf2(PVPeriodic, DdwfPeriodic,CG,CG);
Nf2.is_smeared = true;

////////////////////////////////////////////////
// Plaquette only FTHMC smearer
////////////////////////////////////////////////
double rho = 0.1;
Smear_Stout<PeriodicGimplR> Smearer(rho);
SmearedConfigurationMasked<PeriodicGimplR> SmartConfig(UGrid,2*Nd,Smearer);
SmearedConfiguration<PeriodicGimplR> StoutConfig(UGrid,1,Smearer);

JacobianAction<PeriodicGimplR> Jacobian(&SmartConfig);

////////////////////////////////////////////////
// Run some tests
////////////////////////////////////////////////
MomentumFilterNone<LatticeGaugeField> FilterNone;

std::cout << " ********* FIELD TRANSFORM SMEARING ***** "<<std::endl;

SmartConfig.set_Field(U);
ForceTest<GimplTypesR>(PlaqAction,SmartConfig,FilterNone);

SmartConfig.set_Field(U);
ForceTest<GimplTypesR>(RectAction,SmartConfig,FilterNone);

SmartConfig.set_Field(U);
ForceTest<GimplTypesR>(Jacobian,SmartConfig,FilterNone);

SmartConfig.set_Field(U);
ForceTest<GimplTypesR>(Nf2,SmartConfig,FilterNone);

std::cout << " ********* STOUT SMEARING ***** "<<std::endl;

StoutConfig.set_Field(U);
ForceTest<GimplTypesR>(PlaqAction,StoutConfig,FilterNone);

StoutConfig.set_Field(U);
ForceTest<GimplTypesR>(RectAction,StoutConfig,FilterNone);

StoutConfig.set_Field(U);
ForceTest<GimplTypesR>(Nf2,StoutConfig,FilterNone);


Grid_finalize();
}
@ -85,7 +85,7 @@ int main(int argc, char **argv) {
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////

const int Ls = 4;
const int Ls = 8;
Real beta = 2.13;
Real light_mass = 0.01;
Real strange_mass = 0.04;