Merge 1dfaa08afb into b8a7004365

Partial fraction test
Merge pull request #439 from felixerben/bugfix/IRL_convergence
2025-07-14 03:57:06 +01:00 · 2023-09-03 09:29:21 -07:00 · 2023-08-14 15:17:03 -04:00 · 2023-07-12 16:32:26 -04:00 · 2023-06-28 15:11:24 -04:00 · 2023-06-27 14:58:10 -04:00
65 changed files with 4646 additions and 348 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -0,0 +1,54 @@
+name: Bug report
+description: Report a bug.
+title: "<insert title>"
+labels: [bug]
+
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thank you for taking the time to file a bug report.
+        Please check that the code is pointing to the HEAD of develop
+        or any commit in master which is tagged with a version number.
+
+  - type: textarea
+    attributes:
+      label: "Describe the issue:"
+      description: >
+        Describe the issue and any previous attempt to solve it.
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: "Code example:"
+      description: >
+        If relevant, show how to reproduce the issue using a minimal working
+        example.
+      placeholder: |
+        << your code here >>
+      render: shell
+    validations:
+      required: false
+
+  - type: textarea
+    attributes:
+      label: "Target platform:"
+      description: >
+        Give a description of the target platform (CPU, network, compiler).
+        Please give the full CPU part description, using for example
+        `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
+        or `sysctl machdep.cpu.brand_string` (macOS) and the full output of
+        the `--version` option of your compiler.
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: "Configure options:"
+      description: >
+        Please give the exact configure command used and attach
+        `config.log`, `grid.config.summary` and the output of `make V=1`.
+      render: shell
+    validations:
+      required: true
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@ -542,6 +542,7 @@ public:
      (*this)(in[i], out[i]);
    }
  }
+  virtual ~LinearFunction(){};
 };

 template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
@ -166,16 +166,16 @@ public:
      rsqf[s] =rsq[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      //      ps_d[s] = src_d;
-      precisionChangeFast(ps_f[s],src_d);
+      precisionChange(ps_f[s],src_d);
    }
    // r and p for primary
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    r_d = p_d;
    
    //MdagM+m[0]
-    precisionChangeFast(p_f,p_d);
+    precisionChange(p_f,p_d);
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
-    precisionChangeFast(tmp_d,mmp_f);
+    precisionChange(tmp_d,mmp_f);
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
@ -204,7 +204,7 @@ public:
  
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
-      precisionChangeFast(psi_f[s],psi_d[s]);
+      precisionChange(psi_f[s],psi_d[s]);
    }
  
    ///////////////////////////////////////
@ -225,7 +225,7 @@ public:
      AXPYTimer.Stop();

      PrecChangeTimer.Start();
-      precisionChangeFast(r_f, r_d);
+      precisionChange(r_f, r_d);
      PrecChangeTimer.Stop();

      AXPYTimer.Start();
@ -243,13 +243,13 @@ public:

      cp=c;
      PrecChangeTimer.Start();
-      precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
+      precisionChange(p_f, p_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f);
      MatrixTimer.Stop();  
      PrecChangeTimer.Start();
-      precisionChangeFast(mmp_d, mmp_f); // From Float to Double
+      precisionChange(mmp_d, mmp_f); // From Float to Double
      PrecChangeTimer.Stop();

      d=real(innerProduct(p_d,mmp_d));    
@ -311,7 +311,7 @@ public:
 	SolverTimer.Stop();

 	for(int s=0;s<nshift;s++){
-	  precisionChangeFast(psi_d[s],psi_f[s]);
+	  precisionChange(psi_d[s],psi_f[s]);
 	}

 	
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@ -211,7 +211,7 @@ public:
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-    //    assert(norm2(tmp_d)< 1.0e-4);
+    assert(norm2(tmp_d)< 1.0);

    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -419,14 +419,15 @@ until convergence
 	}
      }

-      if ( Nconv < Nstop )
+      if ( Nconv < Nstop ) {
 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
-
+	std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
+      }
      eval=eval2;
      
      //Keep only converged
-      eval.resize(Nconv);// Nstop?
-      evec.resize(Nconv,grid);// Nstop?
+      eval.resize(Nstop);// was Nconv
+      evec.resize(Nstop,grid);// was Nconv
      basisSortInPlace(evec,eval,reverse);
      
    }
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -27,9 +27,10 @@ Author: Christoph Lehner <christoph@lhnr.de>
 *************************************************************************************/
 /*  END LEGAL */

+#define Mheader "SharedMemoryMpi: "
+
 #include <Grid/GridCore.h>
 #include <pwd.h>
-#include <syscall.h>

 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
@ -39,11 +40,118 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #endif
 #ifdef GRID_SYCL
 #define GRID_SYCL_LEVEL_ZERO_IPC
+#include <syscall.h>
+#define SHM_SOCKETS 
+#endif
+
+#include <sys/socket.h>
+#include <sys/un.h>
+
+NAMESPACE_BEGIN(Grid); 
+
+#ifdef SHM_SOCKETS
+
+/*
+ * Barbaric extra intranode communication route in case we need sockets to pass FDs
+ * Forced by level_zero not being nicely designed
+ */
+// File-scope state shared by the UnixSockets helpers below.
+static int sock;                                              // datagram socket created by UnixSockets::Open
+static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d"; // per-rank socket path template
+static char sock_path[256];                                   // path of the most recent send destination
+class UnixSockets {
+public:
+  // Create the AF_UNIX datagram socket and bind it to the path derived from 'rank'.
+  // Any existing file at that path is unlinked first. Exits the process if bind fails.
+  static void Open(int rank)
+  {
+    sock = socket(AF_UNIX, SOCK_DGRAM, 0);
+    assert(sock>=0); // socket() reports failure as -1; descriptor 0 is legal
+
+    struct sockaddr_un sa_un = { 0 };
+    sa_un.sun_family = AF_UNIX;
+    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
+    unlink(sa_un.sun_path);
+    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
+      perror("bind failure");
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  // Block until one message arrives on 'sock' and return the file descriptor
+  // carried in its SCM_RIGHTS control message.
+  // Returns -1 if recvmsg fails, returns no data, or carries no descriptor.
+  static int RecvFileDescriptor(void)
+  {
+    int n;
+    int fd = -1;
+    char buf[1];
+    struct iovec iov;
+    struct msghdr msg;
+    struct cmsghdr *cmsg;
+    char cms[CMSG_SPACE(sizeof(int))];
+
+    iov.iov_base = buf;
+    iov.iov_len = 1;
+
+    memset(&msg, 0, sizeof msg);
+    msg.msg_name = 0;
+    msg.msg_namelen = 0;
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+
+    msg.msg_control = cms;
+    msg.msg_controllen = sizeof cms;
+
+    if((n=recvmsg(sock, &msg, 0)) < 0) {
+      perror("recvmsg failed");
+      return -1;
+    }
+    if(n == 0){
+      // errno is not set on a zero-length read, so do not use perror here
+      fprintf(stderr,"recvmsg returned 0\n");
+      return -1;
+    }
+    cmsg = CMSG_FIRSTHDR(&msg);
+    // CMSG_FIRSTHDR yields NULL when no control data was delivered; also make
+    // sure what arrived really is a descriptor before reading it out.
+    if ( cmsg == NULL
+	 || cmsg->cmsg_level != SOL_SOCKET
+	 || cmsg->cmsg_type  != SCM_RIGHTS
+	 || cmsg->cmsg_len   <  CMSG_LEN(sizeof(int)) ) {
+      fprintf(stderr,"recvmsg carried no file descriptor\n");
+      return -1;
+    }
+
+    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
+
+    return fd;
+  }
+
+  // Send descriptor 'fildes' as SCM_RIGHTS ancillary data (with a one byte payload)
+  // to the socket path derived from 'xmit_to_rank'. A failed send is reported on stderr.
+  static void SendFileDescriptor(int fildes,int xmit_to_rank)
+  {
+    struct msghdr msg;
+    struct iovec iov;
+    struct cmsghdr *cmsg = NULL;
+    char ctrl[CMSG_SPACE(sizeof(int))];
+    char data = ' ';
+
+    memset(&msg, 0, sizeof(struct msghdr));
+    memset(ctrl, 0, sizeof(ctrl));
+    iov.iov_base = &data;
+    iov.iov_len = sizeof(data);
+    
+    // bounded write: sock_path is a fixed-size buffer
+    snprintf(sock_path,sizeof(sock_path),sock_path_fmt,xmit_to_rank);
+    
+    struct sockaddr_un sa_un = { 0 };
+    sa_un.sun_family = AF_UNIX;
+    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
+
+    msg.msg_name = (void *)&sa_un;
+    msg.msg_namelen = sizeof(sa_un);
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    msg.msg_controllen =  CMSG_SPACE(sizeof(int));
+    msg.msg_control = ctrl;
+
+    cmsg = CMSG_FIRSTHDR(&msg);
+    cmsg->cmsg_level = SOL_SOCKET;
+    cmsg->cmsg_type = SCM_RIGHTS;
+    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+
+    // copy rather than store through a cast pointer into the char buffer
+    memcpy(CMSG_DATA(cmsg), &fildes, sizeof(int));
+
+    if ( sendmsg(sock, &msg, 0) < 0 ) {
+      perror("sendmsg failed");
+    }
+  };
+};
 #endif


-NAMESPACE_BEGIN(Grid); 
-#define header "SharedMemoryMpi: "
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
@ -66,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);

  if ( WorldRank == 0) {
-    std::cout << header " World communicator of size " <<WorldSize << std::endl;  
-    std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl;
+    std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;  
+    std::cout << Mheader " Node  communicator of size " <<WorldShmSize << std::endl;
  }
  // WorldShmComm, WorldShmSize, WorldShmRank

@ -344,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
 #ifdef GRID_MPI3_SHMGET
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);

@ -429,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);  
  }

-  std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;

  SharedMemoryZero(ShmCommBuf,bytes);
@ -472,7 +580,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
@ -480,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef SHM_SOCKETS
+  UnixSockets::Open(WorldShmRank);
+#endif
  for(int r=0;r<WorldShmSize;r++){

+    MPI_Barrier(WorldShmComm);
+
 #ifndef GRID_MPI3_SHM_NONE
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
@ -489,24 +602,32 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    void * thisBuf = ShmCommBuf;
    if(!Stencil_force_mpi) {
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
-    typedef struct { int fd; pid_t pid ; } clone_mem_t;
+    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
      
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
-
+    
    if ( r==WorldShmRank ) { 
      auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
      if ( err != ZE_RESULT_SUCCESS ) {
-	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
      }
      memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
      handle.pid = getpid();
+      memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
+#ifdef SHM_SOCKETS
+      for(int rr=0;rr<WorldShmSize;rr++){
+	if(rr!=r){
+	  UnixSockets::SendFileDescriptor(handle.fd,rr);
+	}
+      }
+#endif
    }
 #endif
 #ifdef GRID_CUDA
@ -534,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
    { 
+      MPI_Barrier(WorldShmComm);
      int ierr=MPI_Bcast(&handle,
 			 sizeof(handle),
 			 MPI_BYTE,
@ -549,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    if ( r!=WorldShmRank ) {
      thisBuf = nullptr;
+      int myfd;
+#ifdef SHM_SOCKETS
+      myfd=UnixSockets::RecvFileDescriptor();
+#else
      std::cout<<"mapping seeking remote pid/fd "
 	       <<handle.pid<<"/"
 	       <<handle.fd<<std::endl;
@ -556,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
      int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
      //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
-      int myfd  = syscall(438,pidfd,handle.fd,0);
-
-      std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
-      
+      myfd  = syscall(438,pidfd,handle.fd,0);
+      int err_t = errno;
+      if (myfd < 0) {
+        fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
+	perror("pidfd_getfd failed ");
+	assert(0);
+      }
+#endif
+      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
+      memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
      memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));

      auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
      if ( err != ZE_RESULT_SUCCESS ) {
-	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
-	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
+	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
+	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@ -600,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #else
    WorldShmCommBufs[r] = ShmCommBuf;
 #endif
+    MPI_Barrier(WorldShmComm);
  }

  _ShmAllocBytes=bytes;
@ -611,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -648,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    //    std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@ -658,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHM_NONE
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -705,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0); 
  MPI_Barrier(WorldShmComm);
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@ -47,3 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
+#include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -697,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
-
  // the above should guarantee that the operations are local
+  
+#if 1
+
+  size_t nsite = 1;
+  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
+  
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+ 
+  thread_for(idx, nsite, {
+      Coordinate from_coor, to_coor;
+      size_t rem = idx;
+      for(int i=0;i<nd;i++){
+	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
+	from_coor[i] = base_i + FromLowerLeft[i];
+	to_coor[i] = base_i + ToLowerLeft[i];
+      }
+      
+      int foidx = Fg->oIndex(from_coor);
+      int fiidx = Fg->iIndex(from_coor);
+      int toidx = Tg->oIndex(to_coor);
+      int tiidx = Tg->iIndex(to_coor);
+      int* tt = table + 4*idx;
+      tt[0] = foidx;
+      tt[1] = fiidx;
+      tt[2] = toidx;
+      tt[3] = tiidx;
+    });
+  
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(from_v,From,AcceleratorRead);
+  autoView(to_v,To,AcceleratorWrite);
+  
+  accelerator_for(idx,nsite,1,{
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+      int* tt = table_d + 4*idx;
+      int from_oidx = *tt++;
+      int from_lane = *tt++;
+      int to_oidx = *tt++;
+      int to_lane = *tt;
+
+      const vector_type* from = (const vector_type *)&from_v[from_oidx];
+      vector_type* to = (vector_type *)&to_v[to_oidx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+  
+  acceleratorFreeDevice(table_d);    
+  free(table);
+  
+
+#else  
  Coordinate ldf = Fg->_ldimensions;
  Coordinate rdf = Fg->_rdimensions;
  Coordinate isf = Fg->_istride;
@ -707,9 +767,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  Coordinate ist = Tg->_istride;
  Coordinate ost = Tg->_ostride;

-  autoView( t_v , To, AcceleratorWrite);
-  autoView( f_v , From, AcceleratorRead);
-  accelerator_for(idx,Fg->lSites(),1,{
+  autoView( t_v , To, CpuWrite);
+  autoView( f_v , From, CpuRead);
+  thread_for(idx,Fg->lSites(),{
    sobj s;
    Coordinate Fcoor(nd);
    Coordinate Tcoor(nd);
@ -722,17 +782,24 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
    }
    if (in_region) {
-      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
-      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
-      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
-      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
-      vector_type * fp = (vector_type *)&f_v[odx_f];
-      vector_type * tp = (vector_type *)&t_v[odx_t];
+#if 0      
+      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
+      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
+      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
+      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
+      scalar_type * fp = (scalar_type *)&f_v[odx_f];
+      scalar_type * tp = (scalar_type *)&t_v[odx_t];
      for(int w=0;w<words;w++){
 	tp[w].putlane(fp[w].getlane(idx_f),idx_t);
      }
+#else
+    peekLocalSite(s,f_v,Fcoor);
+    pokeLocalSite(s,t_v,Tcoor);
+#endif
    }
  });
+
+#endif
 }


@ -825,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 }


+//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
+//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@ -841,11 +910,70 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int

  for(int d=0;d<nh;d++){
    if ( d!=orthog ) {
-    assert(lg->_processors[d]  == hg->_processors[d]);
-    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
-  }
+      assert(lg->_processors[d]  == hg->_processors[d]);
+      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    }
  }

+#if 1
+  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+  
+  thread_for(idx,nsite,{
+    Coordinate lcoor(nl);
+    Coordinate hcoor(nh);
+    lcoor[orthog] = slice_lo;
+    hcoor[orthog] = slice_hi;
+    size_t rem = idx;
+    for(int mu=0;mu<nl;mu++){
+      if(mu != orthog){
+	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
+	lcoor[mu] = hcoor[mu] = xmu;
+      }
+    }
+    int loidx = lg->oIndex(lcoor);
+    int liidx = lg->iIndex(lcoor);
+    int hoidx = hg->oIndex(hcoor);
+    int hiidx = hg->iIndex(hcoor);
+    int* tt = table + 4*idx;
+    tt[0] = loidx;
+    tt[1] = liidx;
+    tt[2] = hoidx;
+    tt[3] = hiidx;
+    });
+   
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(lowDim_v,lowDim,AcceleratorRead);
+  autoView(higherDim_v,higherDim,AcceleratorWrite);
+  
+  accelerator_for(idx,nsite,1,{
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+      int* tt = table_d + 4*idx;
+      int from_oidx = *tt++;
+      int from_lane = *tt++;
+      int to_oidx = *tt++;
+      int to_lane = *tt;
+
+      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
+      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+  
+  acceleratorFreeDevice(table_d);    
+  free(table);
+  
+#else
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
@ -861,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
+#endif
 }


--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@ -0,0 +1,174 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/PaddedCell.h
+
+    Copyright (C) 2019
+
+Author: Peter Boyle pboyle@bnl.gov
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include<Grid/cshift/Cshift.h>
+
+NAMESPACE_BEGIN(Grid);
+
+//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
+// Interface: a Cshift(in,dir,shift) call returning a new lattice; concrete policies below.
+template<typename vobj>
+struct CshiftImplBase{
+  virtual ~CshiftImplBase() = default;
+  virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
+};
+// Policy that forwards to the free function Grid::Cshift.
+template<typename vobj>
+struct CshiftImplDefault: public CshiftImplBase<vobj>{
+  Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override
+  {
+    return Grid::Cshift(in,dir,shift);
+  }
+};
+// Policy that forwards to Gimpl::CshiftLink for the Gimpl gauge link field type.
+template<typename Gimpl>
+struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
+private:
+  typedef typename Gimpl::GaugeLinkField LinkField;
+public:
+  LinkField Cshift(const LinkField &in, int dir, int shift) const override
+  {
+    return Gimpl::CshiftLink(in,dir,shift);
+  }
+};
+
+// Builds a sequence of grids in which the local volume is enlarged by 'depth'
+// sites on both sides, one dimension at a time, and moves lattice data between
+// the unpadded grid and those padded grids.
+//   grids[d] : dimensions 0..d have local extent increased by 2*depth.
+// The GridCartesian objects in 'grids' are allocated here with new and deleted in
+// the destructor; 'unpadded_grid' is supplied by the caller and is not deleted.
+class PaddedCell {
+public:
+  GridCartesian * unpadded_grid;
+  int dims;
+  int depth;
+  std::vector<GridCartesian *> grids;
+
+  ~PaddedCell()
+  {
+    DeleteGrids();
+  }
+  // _depth : number of padding sites added on each side of every dimension
+  // _grid  : the unpadded grid; each local dimension must be at least _depth
+  PaddedCell(int _depth,GridCartesian *_grid)
+  {
+    unpadded_grid = _grid;
+    depth=_depth;
+    dims=_grid->Nd();
+    AllocateGrids();
+    Coordinate local     =unpadded_grid->LocalDimensions();
+    for(int d=0;d<dims;d++){
+      assert(local[d]>=depth);
+    }
+  }
+  // 'grids' holds owning raw pointers: a member-wise copy would leave two objects
+  // deleting the same GridCartesian instances, so copying is disabled.
+  PaddedCell(const PaddedCell &) = delete;
+  PaddedCell &operator=(const PaddedCell &) = delete;
+  // Moving hands the grids over; the source is left with an empty list.
+  PaddedCell(PaddedCell &&other)
+    : unpadded_grid(other.unpadded_grid), dims(other.dims), depth(other.depth)
+  {
+    grids.swap(other.grids);
+  }
+  PaddedCell &operator=(PaddedCell &&other)
+  {
+    if ( this != &other ) {
+      DeleteGrids();
+      unpadded_grid = other.unpadded_grid;
+      dims  = other.dims;
+      depth = other.depth;
+      grids.swap(other.grids);
+    }
+    return *this;
+  }
+  // Delete every padded grid and empty the list.
+  void DeleteGrids(void)
+  {
+    for(size_t g=0;g<grids.size();g++){
+      delete grids[g];
+    }
+    grids.resize(0);
+  };
+  // Append one new padded grid per dimension to 'grids'.
+  void AllocateGrids(void)
+  {
+    Coordinate simd      =unpadded_grid->_simd_layout;
+    Coordinate processors=unpadded_grid->_processors;
+    Coordinate plocal    =unpadded_grid->LocalDimensions();
+    Coordinate global(dims);
+
+    // expand up one dim at a time
+    for(int d=0;d<dims;d++){
+
+      plocal[d] += 2*depth; 
+
+      // global extent = (padded) local extent times processor count, per dimension
+      for(int mu=0;mu<dims;mu++){
+	global[mu] = plocal[mu]*processors[mu];
+      }
+
+      grids.push_back(new GridCartesian(global,simd,processors));
+    }
+  };
+  // Return a lattice on the unpadded grid filled from the region of 'in' that
+  // starts at local coordinate 'depth' in every dimension.
+  template<class vobj>
+  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
+  {
+    Lattice<vobj> out(unpadded_grid);
+
+    Coordinate local     =unpadded_grid->LocalDimensions();
+    Coordinate fll(dims,depth); // depends on the MPI spread
+    Coordinate tll(dims,0); // depends on the MPI spread
+    localCopyRegion(in,out,fll,tll,local);
+    return out;
+  }
+  // Apply Expand() successively in every dimension of in.Grid() and return the result.
+  template<class vobj>
+  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
+  {
+    GridBase *old_grid = in.Grid();
+    int nd = old_grid->Nd();
+    Lattice<vobj> tmp = in;
+    for(int d=0;d<nd;d++){
+      tmp = Expand(d,tmp,cshift); // rvalue && assignment
+    }
+    return tmp;
+  }
+  // expand up one dim at a time
+  // 'in' must live on unpadded_grid (dim==0) or on grids[dim-1]; the result lives on grids[dim].
+  template<class vobj>
+  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
+  {
+    GridBase *old_grid = in.Grid();
+    GridCartesian *new_grid = grids[dim];//These are new grids
+    Lattice<vobj>  padded(new_grid);
+    Lattice<vobj> shifted(old_grid);    
+    Coordinate local     =old_grid->LocalDimensions();
+    Coordinate plocal    =new_grid->LocalDimensions();
+    if(dim==0) conformable(old_grid,unpadded_grid);
+    else       conformable(old_grid,grids[dim-1]);
+
+    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
+
+    double tins=0, tshift=0;
+    
+    // Middle bit: slice x of 'in' goes to slice depth+x of 'padded'
+    double t = usecond();
+    for(int x=0;x<local[dim];x++){
+      InsertSliceLocal(in,padded,x,depth+x,dim);
+    }
+    tins += usecond() - t;
+    
+    // High bit: last 'depth' slices of the +depth shifted field fill the upper padding
+    t = usecond();
+    shifted = cshift.Cshift(in,dim,depth);
+    tshift += usecond() - t;
+
+    t=usecond();
+    for(int x=0;x<depth;x++){
+      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
+    }
+    tins += usecond() - t;
+    
+    // Low bit: first 'depth' slices of the -depth shifted field fill the lower padding
+    t = usecond();
+    shifted = cshift.Cshift(in,dim,-depth);
+    tshift += usecond() - t;
+    
+    t = usecond();
+    for(int x=0;x<depth;x++){
+      InsertSliceLocal(shifted,padded,x,x,dim);
+    }
+    tins += usecond() - t;
+
+    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
+    
+    return padded;
+  }
+
+};
+ 
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@ -104,6 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca
 template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+template<typename vtype> using iLorentzComplex            = iVector<iScalar<iScalar<vtype> >, Nd > ;
 template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
@ -178,6 +179,15 @@ typedef iLorentzColourMatrix<vComplexF>  vLorentzColourMatrixF;
 typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD;
 typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;

+// LorentzComplex
+typedef iLorentzComplex<Complex  > LorentzComplex;
+typedef iLorentzComplex<ComplexF > LorentzComplexF;
+typedef iLorentzComplex<ComplexD > LorentzComplexD;
+
+typedef iLorentzComplex<vComplex > vLorentzComplex;
+typedef iLorentzComplex<vComplexF> vLorentzComplexF;
+typedef iLorentzComplex<vComplexD> vLorentzComplexD;
+
 // DoubleStored gauge field
 typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
@ -307,6 +317,10 @@ typedef Lattice<vLorentzColourMatrixF>  LatticeLorentzColourMatrixF;
 typedef Lattice<vLorentzColourMatrixD>  LatticeLorentzColourMatrixD;
 typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;

+typedef Lattice<vLorentzComplex>  LatticeLorentzComplex;
+typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
+typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
+
 // DoubleStored gauge field
 typedef Lattice<vDoubleStoredColourMatrix>   LatticeDoubleStoredColourMatrix;
 typedef Lattice<vDoubleStoredColourMatrixF>  LatticeDoubleStoredColourMatrixF;
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@ -34,10 +34,24 @@ directory

 NAMESPACE_BEGIN(Grid);

+///////////////////////////////////
+// Smart configuration base class: abstract container that hands out the thin or the smeared links
+///////////////////////////////////
+template< class Field >
+class ConfigurationBase
+{
+public:
+  ConfigurationBase() {}
+  virtual ~ConfigurationBase() {}
+  virtual void set_Field(Field& U) =0;             // attach the container to the field U
+  virtual void smeared_force(Field&) = 0;          // in-place transform of a force; Action::deriv calls it when is_smeared
+  virtual Field& get_SmearedU() =0;                // smeared links (NoSmearing returns the thin links)
+  virtual Field &get_U(bool smeared = false) = 0;  // thin links by default, smeared links on request
+};
+
 template <class GaugeField >
 class Action 
 {
-
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
@ -77,11 +91,39 @@ public:
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
+  /////////////////////////////
  // Heatbath?
+  /////////////////////////////
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
+
+  /////////////////////////////////////////////////////////////
+  // virtual smeared interface through configuration container; is_smeared selects the field forwarded to the GaugeField overloads
+  /////////////////////////////////////////////////////////////
+  virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
+  {
+    refresh(U.get_U(is_smeared),sRNG,pRNG);
+  }
+  virtual RealD S(ConfigurationBase<GaugeField>& U)
+  {
+    return S(U.get_U(is_smeared));
+  }
+  virtual RealD Sinitial(ConfigurationBase<GaugeField>& U) 
+  {
+    return Sinitial(U.get_U(is_smeared));
+  }
+  virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
+  {
+    deriv(U.get_U(is_smeared),dSdU); // derivative evaluated on the field selected by is_smeared
+    if ( is_smeared ) {
+      U.smeared_force(dSdU); // container post-processes the force in place for smeared actions
+    }
+  }
+  ///////////////////////////////
+  // Logging
+  ///////////////////////////////
  virtual std::string action_name()    = 0;                             // return the action name
  virtual std::string LogParameters()  = 0;                             // prints action parameters
  virtual ~Action(){}
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@ -30,6 +30,8 @@ directory
 #ifndef QCD_ACTION_CORE
 #define QCD_ACTION_CORE

+#include <Grid/qcd/action/gauge/GaugeImplementations.h>
+
 #include <Grid/qcd/action/ActionBase.h>
 NAMESPACE_CHECK(ActionBase);
 #include <Grid/qcd/action/ActionSet.h>
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -507,6 +507,7 @@ public:
    }
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
+    accelerator_barrier();
  }

 };
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -332,8 +332,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  /////////////////////////////
  {
    GRID_TRACE("Gather");
-    st.HaloExchangeOptGather(in,compressor);
-    accelerator_barrier();
+    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
  }
  
  std::vector<std::vector<CommsRequest_t> > requests;
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -423,14 +423,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();

 #define KERNEL_CALL_EXT(A)						\
-  const uint64_t    NN = Nsite*Ls;					\
  const uint64_t    sz = st.surface_list.size();			\
  auto ptr = &st.surface_list[0];					\
  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
      int sF = ptr[ss];							\
-      int sU = ss/Ls;							\
+      int sU = sF/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
-    });									
+    });									\
+  accelerator_barrier();

 #define ASM_CALL(A)							\
  thread_for( sss, Nsite, {						\
@ -474,9 +474,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
+     // dependent on result of merge
     acceleratorFenceComputeStream();
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
@ -506,9 +507,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
+     // Dependent on result of merge
     acceleratorFenceComputeStream();
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@ -176,7 +176,7 @@ public:
      return PeriodicBC::CshiftLink(Link,mu,shift);
  }

-  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
+  static inline void       setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
 };
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@ -43,7 +43,7 @@ public:
 private:
  RealD c_plaq;
  RealD c_rect;
-
+  typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
 public:
  PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};

@ -79,27 +79,18 @@ public:
    GridBase *grid = Umu.Grid();

    std::vector<GaugeLinkField> U (Nd,grid);
-    std::vector<GaugeLinkField> U2(Nd,grid);
-
    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-      WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
    }
+    std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
+    WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);

    GaugeLinkField dSdU_mu(grid);
    GaugeLinkField staple(grid);

    for (int mu=0; mu < Nd; mu++){
-
-      // Staple in direction mu
-
-      WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
-
-      dSdU_mu = Ta(U[mu]*staple)*factor_p;
-
-      WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
-
-      dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
+      dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
+      dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
 	  
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@ -53,9 +53,10 @@ NAMESPACE_BEGIN(Grid);
      Integer ReliableUpdateFreq;
    protected:

+      //Action evaluation
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
-#if 0
+#if 1
 	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
@ -70,9 +71,10 @@ NAMESPACE_BEGIN(Grid);
 	msCG(schurOpD, in, out);
 #endif
      }
+      //Force evaluation
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
-	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF);
+	SchurDifferentiableOperator<ImplF>  schurOpF(numerator ? NumOpF  : DenOpF);

 	FermionFieldD inD(NumOpD.FermionRedBlackGrid());
 	FermionFieldD outD(NumOpD.FermionRedBlackGrid());
@ -84,20 +86,15 @@ NAMESPACE_BEGIN(Grid);
      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){

 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
-	typename ImplD::GaugeField Ud2(NumOpD.GaugeGrid());
 	precisionChange(Uf, Ud);
-	precisionChange(Ud2, Ud);

-	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
+	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl;
 	
 	NumOpD.ImportGauge(Ud);
 	DenOpD.ImportGauge(Ud);

 	NumOpF.ImportGauge(Uf);
 	DenOpF.ImportGauge(Uf);
-
-	NumOpD.ImportGauge(Ud2);
-	DenOpD.ImportGauge(Ud2);
      }
      
    public:
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@ -207,20 +207,27 @@ NAMESPACE_BEGIN(Grid);
        //X = (Mdag M)^-1 V^dag phi
        //Y = (Mdag)^-1 V^dag  phi
        Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi
+	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
        X=Zero();
        DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi
+	std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl;
        Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi
+	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;

        // phi^dag V (Mdag M)^-1 dV^dag  phi
        Vpc.MpcDagDeriv(force , X, PhiOdd );   dSdU = force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
  
        // phi^dag dV (Mdag M)^-1 V^dag  phi
        Vpc.MpcDeriv(force , PhiOdd, X );      dSdU = dSdU+force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;

        //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
        //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
        Mpc.MpcDeriv(force,Y,X);              dSdU = dSdU-force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
        Mpc.MpcDagDeriv(force,X,Y);           dSdU = dSdU-force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;

        // FIXME No force contribution from EvenEven assumed here
        // Needs a fix for clover.
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@ -283,12 +283,13 @@ public:
      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;

      TheIntegrator.print_timer();
-
+      
+      TheIntegrator.Smearer.set_Field(Ucur);
      for (int obs = 0; obs < Observables.size(); obs++) {
      	std::cout << GridLogDebug << "Observables # " << obs << std::endl;
      	std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl;
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
-        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
+        Observables[obs]->TrajectoryComplete(traj + 1, TheIntegrator.Smearer, sRNG, pRNG);
      }
      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
--- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
@ -35,13 +35,16 @@ class CheckpointerParameters : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters, 
 				  std::string, config_prefix, 
+				  std::string, smeared_prefix, 
 				  std::string, rng_prefix, 
 				  int, saveInterval, 
+				  bool, saveSmeared, 
 				  std::string, format, );

-  CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng",
+  CheckpointerParameters(std::string cf = "cfg", std::string sf="cfg_smr" , std::string rn = "rng",
 			 int savemodulo = 1, const std::string &f = "IEEE64BIG")
    : config_prefix(cf),
+      smeared_prefix(sf),
      rng_prefix(rn),
-      saveInterval(savemodulo),
+      saveInterval(savemodulo), saveSmeared(false), // default: thin links only; the flag must never be left indeterminate
      format(f){};
@ -61,13 +64,21 @@ template <class Impl>
 class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
 public:
  void build_filenames(int traj, CheckpointerParameters &Params,
-                       std::string &conf_file, std::string &rng_file) {
+                       std::string &conf_file,
+                       std::string &smear_file,
+		       std::string &rng_file) {
    {
      std::ostringstream os;
      os << Params.rng_prefix << "." << traj;
      rng_file = os.str();
    }

+    {
+      std::ostringstream os;
+      os << Params.smeared_prefix << "." << traj;
+      smear_file = os.str();
+    }
+
    {
      std::ostringstream os;
      os << Params.config_prefix << "." << traj;
@ -84,6 +95,11 @@ public:
  }
  virtual void initialize(const CheckpointerParameters &Params) = 0;

+  virtual void TrajectoryComplete(int traj,
+                                  typename Impl::Field &U,
+                                  GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG) { assert(0); } ; // Plain-field entry point is rejected for checkpointers: HMC should pass the smart config with smeared and unsmeared links (NOTE: assert is a no-op when NDEBUG is defined)
+  
  virtual void CheckpointRestore(int traj, typename Impl::Field &U,
                                 GridSerialRNG &sRNG,
                                 GridParallelRNG &pRNG) = 0;
--- a/Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h
@ -61,11 +61,14 @@ public:
    fout.close();
  }

-  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
+  void TrajectoryComplete(int traj,
+			  ConfigurationBase<Field> &SmartConfig,
+			  GridSerialRNG &sRNG, GridParallelRNG &pRNG)
+  {

    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
+      std::string config, rng, smr;
+      this->build_filenames(traj, Params, config, smr, rng);

      uint32_t nersc_csum;
      uint32_t scidac_csuma;
@ -74,9 +77,15 @@ public:
      BinarySimpleUnmunger<sobj_double, sobj> munge;
      truncate(rng);
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      truncate(config);
+      std::cout << GridLogMessage << "Written Binary RNG " << rng
+                << " checksum " << std::hex 
+		<< nersc_csum   <<"/"
+		<< scidac_csuma   <<"/"
+		<< scidac_csumb 
+		<< std::dec << std::endl;

-      BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
+      truncate(config);
+      BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(false), config, munge, 0, Params.format,
 						      nersc_csum,scidac_csuma,scidac_csumb);

      std::cout << GridLogMessage << "Written Binary Configuration " << config
@ -85,6 +94,18 @@ public:
 		<< scidac_csuma   <<"/"
 		<< scidac_csumb 
 		<< std::dec << std::endl;
+
+      if ( Params.saveSmeared ) {
+	truncate(smr);
+	BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(true), smr, munge, 0, Params.format,
+							nersc_csum,scidac_csuma,scidac_csumb);
+	std::cout << GridLogMessage << "Written Binary Smeared Configuration " << smr
+                << " checksum " << std::hex 
+		<< nersc_csum   <<"/"
+		<< scidac_csuma   <<"/"
+		<< scidac_csumb 
+		<< std::dec << std::endl;
+      }
    }

  };
--- a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
@ -69,17 +69,27 @@ public:
    }
  }

-  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
+  void TrajectoryComplete(int traj,
+			  ConfigurationBase<GaugeField> &SmartConfig,
+			  GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
-      GridBase *grid = U.Grid();
+      std::string config, rng, smr;
+      this->build_filenames(traj, Params, config, smr, rng); // five-argument form: smr names the smeared output file
+      GridBase *grid = SmartConfig.get_U(false).Grid();
      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
+      std::cout << GridLogMessage << "Written BINARY RNG " << rng
+                << " checksum " << std::hex 
+		<< nersc_csum<<"/"
+		<< scidac_csuma<<"/"
+		<< scidac_csumb
+		<< std::dec << std::endl;
+
+      
      IldgWriter _IldgWriter(grid->IsBoss());
      _IldgWriter.open(config);
-      _IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
+      _IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(false), traj, config, config);
      _IldgWriter.close();

      std::cout << GridLogMessage << "Written ILDG Configuration on " << config
@ -88,6 +98,21 @@ public:
 		<< scidac_csuma<<"/"
 		<< scidac_csumb
 		<< std::dec << std::endl;
+
+      if ( Params.saveSmeared ) { 
+	IldgWriter _IldgWriter(grid->IsBoss()); // separate writer scoped to this branch
+	_IldgWriter.open(smr);
+	_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr); // metadata strings name the smeared file, not the thin-link one
+	_IldgWriter.close();
+
+	std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
+                << " checksum " << std::hex 
+		<< nersc_csum<<"/"
+		<< scidac_csuma<<"/"
+		<< scidac_csumb
+		<< std::dec << std::endl;
+      }
+
    }
  };

--- a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
@ -52,23 +52,29 @@ public:
    Params.format = "IEEE64BIG";  // fixed, overwrite any other choice
  }

-  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
-                          GridParallelRNG &pRNG) {
+  virtual void TrajectoryComplete(int traj,
+                                  ConfigurationBase<GaugeField> &SmartConfig,
+                                  GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG)
+  { // Every saveInterval trajectories: write RNG state, the thin links, and (if Params.saveSmeared) the smeared links
    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
-
+      std::string config, rng, smr;
+      this->build_filenames(traj, Params, config, smr, rng);
+      
      int precision32 = 1;
      int tworow = 0;
      NerscIO::writeRNGState(sRNG, pRNG, rng);
-      NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
+      NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(false), config, tworow, precision32); // get_U(false): thin links
+      if ( Params.saveSmeared ) {
+	NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(true), smr, tworow, precision32); // get_U(true): smeared links
+      }
    }
  };

  void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
                         GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
+    std::string config, rng, smr;
+    this->build_filenames(traj, Params, config, smr, rng );
    this->check_filename(rng);
    this->check_filename(config);

--- a/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
@ -70,19 +70,37 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
    }
  }

-  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
+  void TrajectoryComplete(int traj, 
+			  ConfigurationBase<Field> &SmartConfig,
+			  GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
-      GridBase *grid = U.Grid();
+      std::string config, rng,smr;
+      this->build_filenames(traj, Params, config, smr, rng);
+      GridBase *grid = SmartConfig.get_U(false).Grid();
      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      ScidacWriter _ScidacWriter(grid->IsBoss());
-      _ScidacWriter.open(config);
-      _ScidacWriter.writeScidacFieldRecord(U, MData);
-      _ScidacWriter.close();
+      std::cout << GridLogMessage << "Written Binary RNG " << rng
+                << " checksum " << std::hex 
+		<< nersc_csum   <<"/"
+		<< scidac_csuma   <<"/"
+		<< scidac_csumb 
+		<< std::dec << std::endl;

+
+      {
+	ScidacWriter _ScidacWriter(grid->IsBoss());
+	_ScidacWriter.open(config);
+	_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(false), MData);
+	_ScidacWriter.close();
+      }
+      
+      if ( Params.saveSmeared ) {
+	ScidacWriter _ScidacWriter(grid->IsBoss());
+	_ScidacWriter.open(smr);
+	_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(true), MData);
+	_ScidacWriter.close();
+      }
      std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl;
    }
  };
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@ -66,6 +66,7 @@ public:
 template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy>
 class Integrator {
 protected:
+public:
  typedef FieldImplementation_ FieldImplementation;
  typedef typename FieldImplementation::Field MomentaField;  //for readability
  typedef typename FieldImplementation::Field Field;
@ -96,7 +97,6 @@ protected:
  {
    t_P[level] += ep;
    update_P(P, U, level, ep);
-
    std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
  }

@ -130,28 +130,20 @@ protected:
      Field force(U.Grid());
      conformable(U.Grid(), Mom.Grid());

-      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();

-      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
-      
      as[level].actions.at(a)->deriv_timer_start();
-      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
+      as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta
      as[level].actions.at(a)->deriv_timer_stop();

-      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
-
-      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      auto name = as[level].actions.at(a)->action_name();
-      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);

      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-
-      //      DumpSliceNorm("force ",force,Nd-1);
+      
      MomFilter->applyFilter(force);
+
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl;
-      DumpSliceNorm("force filtered ",force,Nd-1);
      
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
@ -377,14 +369,9 @@ public:
 	auto name = as[level].actions.at(actionID)->action_name();
        std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;

-        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
-
-	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
-
 	as[level].actions.at(actionID)->refresh_timer_start();
-        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
+        as[level].actions.at(actionID)->refresh(Smearer, sRNG, pRNG);
 	as[level].actions.at(actionID)->refresh_timer_stop();
-	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;

      }

@ -425,10 +412,9 @@ public:

        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
-        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 	        as[level].actions.at(actionID)->S_timer_start();
-        Hterm = as[level].actions.at(actionID)->S(Us);
+        Hterm = as[level].actions.at(actionID)->S(Smearer);
   	        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
@ -469,12 +455,11 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
-        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
-	        as[level].actions.at(actionID)->S_timer_start();

-        Hterm = as[level].actions.at(actionID)->Sinitial(Us);
-   	        as[level].actions.at(actionID)->S_timer_stop();
+	as[level].actions.at(actionID)->S_timer_start();
+        Hterm = as[level].actions.at(actionID)->S(Smearer);
+	as[level].actions.at(actionID)->S_timer_stop();

        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
--- a/Grid/qcd/observables/hmc_observable.h
+++ b/Grid/qcd/observables/hmc_observable.h
@ -34,6 +34,13 @@ NAMESPACE_BEGIN(Grid);
 template <class Field>
 class HmcObservable {
 public:
+  virtual void TrajectoryComplete(int traj,
+                                  ConfigurationBase<Field> &SmartConfig,
+                                  GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG)
+  { // Default for smart configurations: forward to the Field& overload using get_U(false)
+    TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
+  };
  virtual void TrajectoryComplete(int traj,
                                  Field &U,
                                  GridSerialRNG &sRNG,
--- a/Grid/qcd/observables/plaquette.h
+++ b/Grid/qcd/observables/plaquette.h
@ -42,6 +42,18 @@ public:
  // necessary for HmcObservable compatibility
  typedef typename Impl::Field Field;

+  virtual void TrajectoryComplete(int traj,
+                                  ConfigurationBase<Field> &SmartConfig,
+                                  GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG)
+  { // Runs the Field& overload twice: first on get_U(false), then on get_U(true)
+    std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
+    std::cout << GridLogMessage << "Unsmeared plaquette"<<std::endl;
+    TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
+    std::cout << GridLogMessage << "Smeared plaquette"<<std::endl;
+    TrajectoryComplete(traj,SmartConfig.get_U(true),sRNG,pRNG); // Smeared observable
+    std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
+  };
  void TrajectoryComplete(int traj,
                          Field &U,
                          GridSerialRNG &sRNG,
--- a/Grid/qcd/smearing/GaugeConfiguration.h
+++ b/Grid/qcd/smearing/GaugeConfiguration.h
@ -7,26 +7,27 @@

 NAMESPACE_BEGIN(Grid);

+
 //trivial class for no smearing
 template< class Impl >
-class NoSmearing
+class NoSmearing : public ConfigurationBase<typename Impl::Field>
 {
 public:
  INHERIT_FIELD_TYPES(Impl);

-  Field* ThinField;
+  Field* ThinLinks;

-  NoSmearing(): ThinField(NULL) {}
+  NoSmearing(): ThinLinks(NULL) {}

-  void set_Field(Field& U) { ThinField = &U; }
+  virtual void set_Field(Field& U) { ThinLinks = &U; }

-  void smeared_force(Field&) const {}
+  virtual void smeared_force(Field&) {}

-  Field& get_SmearedU() { return *ThinField; }
+  virtual Field& get_SmearedU() { return *ThinLinks; }

-  Field &get_U(bool smeared = false)
+  virtual Field &get_U(bool smeared = false)
  {
-    return *ThinField;
+    return *ThinLinks;
  }
 };

@ -42,19 +43,24 @@ public:
  It stores a list of smeared configurations.
 */
 template <class Gimpl>
-class SmearedConfiguration
+class SmearedConfiguration : public ConfigurationBase<typename Gimpl::Field>
 {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);

-private:
+protected:
  const unsigned int smearingLevels;
  Smear_Stout<Gimpl> *StoutSmearing;
  std::vector<GaugeField> SmearedSet;
-
+public:
+  GaugeField*  ThinLinks; /* Pointer to the thin links configuration */ // move to base???
+protected:
+  
  // Member functions
  //====================================================================
-  void fill_smearedSet(GaugeField &U)
+
+  // Overridden in masked version
+  virtual void fill_smearedSet(GaugeField &U)
  {
    ThinLinks = &U;  // attach the smearing routine to the field U

@ -82,9 +88,10 @@ private:
      }
    }
  }
-  //====================================================================
-  GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
-                                  const GaugeField& GaugeK) const 
+
+  //overridden in masked verson
+  virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
+					  const GaugeField& GaugeK) const 
  {
    GridBase* grid = GaugeK.Grid();
    GaugeField C(grid), SigmaK(grid), iLambda(grid);
@ -213,8 +220,6 @@ private:

  //====================================================================
 public:
-  GaugeField*
-      ThinLinks; /* Pointer to the thin links configuration */

  /* Standard constructor */
  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
@ -230,7 +235,7 @@ public:
    : smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}

  // attach the smeared routines to the thin links U and fill the smeared set
-  void set_Field(GaugeField &U)
+  virtual void set_Field(GaugeField &U)
  {
    double start = usecond();
    fill_smearedSet(U);
@ -240,7 +245,7 @@ public:
  }

  //====================================================================
-  void smeared_force(GaugeField &SigmaTilde) const
+  virtual void smeared_force(GaugeField &SigmaTilde) 
  {
    if (smearingLevels > 0)
    {
@ -267,14 +272,16 @@ public:
      }
      double end = usecond();
      double time = (end - start)/ 1e3;
-      std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;  
+      std::cout << GridLogMessage << " GaugeConfiguration: Smeared Force chain rule took " << time << " ms" << std::endl;
    }  // if smearingLevels = 0 do nothing
+    SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
+      
  }
  //====================================================================

-  GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
+  virtual GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }

-  GaugeField &get_U(bool smeared = false)
+  virtual GaugeField &get_U(bool smeared = false)
  {
    // get the config, thin links by default
    if (smeared)
--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@ -0,0 +1,813 @@
+/*!
+  @file GaugeConfigurationMasked.h
+  @brief Declares the SmearedConfigurationMasked class
+*/
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+/*!
+  @brief Smeared configuration masked container
+  Modified for a multi-subset smearing (aka Luscher Flowed HMC)
+*/
+template <class Gimpl>
+class SmearedConfigurationMasked : public SmearedConfiguration<Gimpl>
+{
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+private:
+  // These live in base class
+  //  const unsigned int smearingLevels;
+  //  Smear_Stout<Gimpl> *StoutSmearing;
+  //  std::vector<GaugeField> SmearedSet;
+  
+  std::vector<LatticeLorentzComplex> masks;
+
+  typedef typename SU3Adjoint::AMatrix AdjMatrix;
+  typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
+  typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField;
+
+  // Adjoint vector to GaugeField force
+  // Expands the adjoint-vector force Fdet_nu (one complex coefficient per generator)
+  // onto the fundamental generators and writes the resulting link field into the
+  // nu polarisation of Fdet:  Fdet[nu] = sum_a  i * Fdet_nu[a] * T_a.
+  void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
+  {
+    Complex imag_unit(0,1);
+    ColourMatrix gen;
+    GaugeLinkField link_force(Fdet.Grid());
+    link_force = Zero();
+    for (int a = 0; a < 8; ++a) {
+      SU3::generator(a, gen);
+      auto coeff = peekColour(Fdet_nu, a);
+      // original author's open question: the norm of the generator differs here -- why?
+      link_force = link_force + imag_unit*coeff*gen;
+    }
+    pokeLorentz(Fdet, link_force, nu);
+  }
+  // For every adjoint index a, builds the adjoint matrix
+  //    Dbc = -tr( i t_b Ta( 2i t_c * [ 2i adj(PlaqL) t_a PlaqR ] ) )
+  // and stores trace(MpInvJx * Dbc) in component a of Fdet2.
+  //
+  //  PlaqL, PlaqR : left / right link products surrounding the generator insertion
+  //  MpInvJx      : adjoint matrix field contracted against Dbc; only read, so it is
+  //                 taken by const reference to avoid copying a lattice field per call
+  //  Fdet2        : output adjoint vector field, all Ngen components are overwritten
+  void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, const AdjMatrixField &MpInvJx,AdjVectorField &Fdet2 )
+  {
+    GaugeLinkField UtaU(PlaqL.Grid());
+    GaugeLinkField D(PlaqL.Grid());
+    AdjMatrixField Dbc(PlaqL.Grid());
+    LatticeComplex tmp(PlaqL.Grid());
+    const int Ngen = SU3Adjoint::Dimension;
+    Complex ci(0,1);
+    ColourMatrix   ta,tb,tc;
+    
+    for(int a=0;a<Ngen;a++) {
+      SU3::generator(a, ta);
+      // Qlat Tb = 2i Tb^Grid
+      UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
+      for(int c=0;c<Ngen;c++) {
+	SU3::generator(c, tc);
+	D = Ta( (2.0)*ci*tc *UtaU);
+	for(int b=0;b<Ngen;b++){
+	  SU3::generator(b, tb);
+	  tmp =-trace(ci*tb*D); 
+	  PokeIndex<ColourIndex>(Dbc,tmp,b,c);  // Adjoint rep
+	}
+      }
+      // every (b,c) entry of Dbc was rewritten above, so nothing leaks between values of a
+      tmp = trace(MpInvJx * Dbc);
+      PokeIndex<ColourIndex>(Fdet2,tmp,a);
+    }
+  }
+  
+  // Fills the adjoint matrix field NxAd with
+  //    NxAd(c,b) = -tr( i t_c * 2 Ta( adj(PlaqL) i t_b PlaqR ) )
+  // for all generator indices b (column) and c (row).
+  void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
+  {
+    const int Ngen = SU3Adjoint::Dimension;
+    Complex ci(0,1);
+    ColourMatrix gen_col;
+    ColourMatrix gen_row;
+    GaugeLinkField Ncol(PlaqL.Grid());
+    for (int col = 0; col < Ngen; ++col) {
+      SU3::generator(col, gen_col);
+      // fundamental-representation matrix for this column of N
+      Ncol = (2.0)*Ta( adj(PlaqL)*ci*gen_col * PlaqR );
+      for (int row = 0; row < Ngen; ++row) {
+	SU3::generator(row, gen_row);
+	auto entry = closure( -trace(ci*gen_row*Ncol));
+	PokeIndex<ColourIndex>(NxAd,entry,row,col);
+      }
+    }
+  }
+  // Multiplies every Lorentz component of U, in place, by the matching component
+  // of the complex mask stored for smearing level smr.
+  void ApplyMask(GaugeField &U,int smr)
+  {
+    GaugeLinkField link(U.Grid());
+    LatticeComplex mask_mu(U.Grid());
+    for (int mu = 0; mu < Nd; ++mu) {
+      mask_mu = PeekIndex<LorentzIndex>(masks[smr],mu);
+      link    = PeekIndex<LorentzIndex>(U,mu);
+      link    = link*mask_mu;
+      PokeIndex<LorentzIndex>(U, link, mu);
+    }
+  }
+public:
+
+  // Computes the force coming from the log-determinant of the smearing Jacobian
+  // for a single smearing level.
+  //
+  //  U     : gauge field that is the input to smearing level smr
+  //  force : output; overwritten with (-0.5)*(Fdet1 + Fdet2)
+  //  smr   : smearing level; selects the smeared direction mu = (smr/2) % Nd and
+  //          the site mask masks[smr]
+  //
+  // Asserts that every off-diagonal entry of StoutSmearing->SmearRho equals
+  // SmearRho[1] and that every diagonal entry is exactly zero.
+  // Prints per-stage timings to GridLogMessage.
+  void logDetJacobianForceLevel(const GaugeField &U, GaugeField &force ,int smr)
+  {
+    GridBase* grid = U.Grid();
+    // NOTE(review): tb is used below; ta, tc, Nxx, Ngen and Fdet_pol are declared but
+    // not referenced anywhere in this function.
+    ColourMatrix   tb;
+    ColourMatrix   tc;
+    ColourMatrix   ta;
+    GaugeField C(grid);
+    GaugeField Umsk(grid);
+    std::vector<GaugeLinkField> Umu(Nd,grid);
+    GaugeLinkField Cmu(grid); // U and staple; C contains factor of epsilon
+    GaugeLinkField Zx(grid);  // U times Staple, contains factor of epsilon
+    GaugeLinkField Nxx(grid);  // Nxx fundamental space
+    GaugeLinkField Utmp(grid);
+    GaugeLinkField PlaqL(grid);
+    GaugeLinkField PlaqR(grid);
+    const int Ngen = SU3Adjoint::Dimension;
+    AdjMatrix TRb;
+    ColourMatrix Ident;
+    LatticeComplex  cplx(grid);
+    
+    AdjVectorField  dJdXe_nMpInv(grid); 
+    AdjVectorField  dJdXe_nMpInv_y(grid); 
+    AdjMatrixField  MpAd(grid);    // Mprime luchang's notes
+    AdjMatrixField  MpAdInv(grid); // Mprime inverse
+    AdjMatrixField  NxxAd(grid);    // Nxx in adjoint space
+    AdjMatrixField  JxAd(grid);     
+    AdjMatrixField  ZxAd(grid);
+    AdjMatrixField  mZxAd(grid);
+    AdjMatrixField  X(grid);
+    Complex ci(0,1);
+
+    RealD t0 = usecond();
+    Ident = ComplexD(1.0);
+    for(int d=0;d<Nd;d++){
+      Umu[d] = peekLorentz(U, d);
+    }
+    int mu= (smr/2) %Nd;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Mask the gauge field
+    ////////////////////////////////////////////////////////////////////////////////
+    // NOTE(review): `mask` is not referenced again; the same field is re-read into
+    // `tmp` further down, just before dJdXe_nMpInv is masked.
+    auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
+
+    Umsk = U;
+    ApplyMask(Umsk,smr);
+    Utmp = peekLorentz(Umsk,mu);
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Retrieve the eps/rho parameter(s) -- could allow all different but not so far
+    ////////////////////////////////////////////////////////////////////////////////
+    double rho=this->StoutSmearing->SmearRho[1];
+    int idx=0;
+    // The loop indices below shadow the outer mu inside the loop body only; the
+    // bounds are hard-coded to 4 rather than Nd.
+    for(int mu=0;mu<4;mu++){
+    for(int nu=0;nu<4;nu++){
+      if ( mu!=nu) assert(this->StoutSmearing->SmearRho[idx]==rho);
+      else         assert(this->StoutSmearing->SmearRho[idx]==0.0);
+      idx++;
+    }}
+    //////////////////////////////////////////////////////////////////
+    // Assemble the N matrix
+    //////////////////////////////////////////////////////////////////
+    // Computes ALL the staples -- could compute one only and do it here
+    RealD time;
+    time=-usecond();
+    this->StoutSmearing->BaseSmear(C, U);
+    Cmu = peekLorentz(C, mu);
+
+    //////////////////////////////////////////////////////////////////
+    // Assemble Luscher exp diff map J matrix 
+    //////////////////////////////////////////////////////////////////
+    // Ta so Z lives in Lie algebra
+    Zx  = Ta(Cmu * adj(Umu[mu]));
+    time+=usecond();
+    std::cout << GridLogMessage << "Z took "<<time<< " us"<<std::endl;
+
+    time=-usecond();
+    // Move Z to the Adjoint Rep == make_adjoint_representation
+    ZxAd = Zero();
+    for(int b=0;b<8;b++) {
+      // Adj group sets traceless antihermitian T's -- Guido, really????
+      SU3::generator(b, tb);         // Fund group sets traceless hermitian T's
+      SU3Adjoint::generator(b,TRb);
+      TRb=-TRb;
+      cplx = 2.0*trace(ci*tb*Zx); // my convention 1/2 delta ba
+      ZxAd = ZxAd + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
+    }
+    time+=usecond();
+    std::cout << GridLogMessage << "ZxAd took "<<time<< " us"<<std::endl;
+
+    //////////////////////////////////////
+    // J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
+    //////////////////////////////////////
+    // The series is truncated after the k=11 term.
+    time=-usecond();
+    X=1.0; 
+    JxAd = X;
+    mZxAd = (-1.0)*ZxAd; 
+    RealD kpfac = 1;
+    for(int k=1;k<12;k++){
+      X=X*mZxAd;
+      kpfac = kpfac /(k+1);
+      JxAd = JxAd + X * kpfac;
+    }
+    time+=usecond();
+    std::cout << GridLogMessage << "Jx took "<<time<< " us"<<std::endl;
+
+    //////////////////////////////////////
+    // dJ(x)/dxe
+    //////////////////////////////////////
+    // Nested recursion in j (20 down to 2): t2 = X*(t2/(j+1) + 1), with dt2 carrying
+    // the product-rule derivative of t2 in the direction of generator TRb.
+    // NOTE(review): this uses more terms than the k<12 series for JxAd above.
+    time=-usecond();
+    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
+    AdjMatrixField tbXn(grid);
+    AdjMatrixField sumXtbX(grid);
+    AdjMatrixField t2(grid);
+    AdjMatrixField dt2(grid);
+    AdjMatrixField t3(grid);
+    AdjMatrixField dt3(grid);
+    AdjMatrixField aunit(grid);
+    for(int b=0;b<8;b++){
+      aunit = ComplexD(1.0);
+      SU3Adjoint::generator(b, TRb); //dt2
+
+      X  = (-1.0)*ZxAd; 
+      t2 = X;
+      dt2 = TRb;
+      for (int j = 20; j > 1; --j) {
+	t3 = t2*(1.0 / (j + 1))  + aunit;
+	dt3 = dt2*(1.0 / (j + 1));
+	t2 = X * t3;
+	dt2 = TRb * t3 + X * dt3;
+      }
+      dJdX[b] = -dt2; 
+    }
+    time+=usecond();
+    std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
+    /////////////////////////////////////////////////////////////////
+    // Mask Umu for this link
+    /////////////////////////////////////////////////////////////////
+    time=-usecond();
+    PlaqL = Ident;
+    PlaqR = Utmp*adj(Cmu);
+    ComputeNxy(PlaqL,PlaqR,NxxAd);
+    time+=usecond();
+    std::cout << GridLogMessage << "ComputeNxy took "<<time<< " us"<<std::endl;
+    
+    ////////////////////////////
+    // Mab
+    ////////////////////////////
+    MpAd = Complex(1.0,0.0);
+    MpAd = MpAd - JxAd * NxxAd;
+
+    /////////////////////////
+    // invert the 8x8
+    /////////////////////////
+    time=-usecond();
+    MpAdInv = Inverse(MpAd);
+    time+=usecond();
+    std::cout << GridLogMessage << "MpAdInv took "<<time<< " us"<<std::endl;
+    
+    RealD t3a = usecond();
+    /////////////////////////////////////////////////////////////////
+    // Nxx Mp^-1
+    /////////////////////////////////////////////////////////////////
+    AdjVectorField  FdetV(grid);
+    AdjVectorField  Fdet1_nu(grid);
+    AdjVectorField  Fdet2_nu(grid);
+    AdjVectorField  Fdet2_mu(grid);
+    AdjVectorField  Fdet1_mu(grid);
+
+    AdjMatrixField nMpInv(grid);
+    nMpInv= NxxAd *MpAdInv;
+
+    AdjMatrixField MpInvJx(grid);
+    AdjMatrixField MpInvJx_nu(grid);
+    MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
+
+    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
+    Fdet2_mu=FdetV;
+    Fdet1_mu=Zero();
+    
+    for(int e =0 ; e<8 ; e++){
+      LatticeComplexD tr(grid);
+      // NOTE(review): te is filled but never used in this loop.
+      ColourMatrix te;
+      SU3::generator(e, te);
+      tr = trace(dJdX[e] * nMpInv);
+      pokeColour(dJdXe_nMpInv,tr,e);
+    }
+    ///////////////////////////////
+    // Mask it off
+    ///////////////////////////////
+    auto tmp=PeekIndex<LorentzIndex>(masks[smr],mu);
+    dJdXe_nMpInv = dJdXe_nMpInv*tmp;
+    
+    //    dJdXe_nMpInv needs to multiply:
+    //       Nxx_mu (site local)                           (1)
+    //       Nxy_mu one site forward  in each nu direction (3)
+    //       Nxy_mu one site backward in each nu direction (3)
+    //       Nxy_nu 0,0  ; +mu,0; 0,-nu; +mu-nu   [ 3x4 = 12]
+    // 19 terms.
+    AdjMatrixField Nxy(grid);
+
+    GaugeField Fdet1(grid);
+    GaugeField Fdet2(grid);
+    GaugeLinkField Fdet_pol(grid); // one polarisation
+
+    RealD t4 = usecond();
+    // For each direction nu != mu: four terms feed the nu polarisation (Fdet1_nu,
+    // Fdet2_nu) and two terms accumulate into the mu polarisation (Fdet1_mu, Fdet2_mu).
+    for(int nu=0;nu<Nd;nu++){
+
+      if (nu!=mu) {
+	///////////////// +ve nu /////////////////
+	//     __
+	//    |  |
+	//    x==    // nu polarisation -- clockwise
+
+	time=-usecond();
+	PlaqL=Ident;
+
+	PlaqR=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
+ 	       Gimpl::CovShiftForward(Umu[mu], mu,
+	         Gimpl::CovShiftBackward(Umu[nu], nu,
+		   Gimpl::CovShiftIdentityBackward(Utmp, mu))));
+	time+=usecond();
+	std::cout << GridLogMessage << "PlaqLR took "<<time<< " us"<<std::endl;
+
+	time=-usecond();
+	dJdXe_nMpInv_y =   dJdXe_nMpInv;
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_nu = transpose(Nxy)*dJdXe_nMpInv_y;
+	time+=usecond();
+	std::cout << GridLogMessage << "ComputeNxy (occurs 6x) took "<<time<< " us"<<std::endl;
+
+	time=-usecond();
+	// sign of PlaqR is flipped before the second (Fdet2) contribution of this term
+	PlaqR=(-1.0)*PlaqR;
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
+	Fdet2_nu = FdetV;
+	time+=usecond();
+	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
+	
+	//    x==
+	//    |  |
+	//    .__|    // nu polarisation -- anticlockwise
+
+	PlaqR=(rho)*Gimpl::CovShiftForward(Umu[nu], nu,
+		      Gimpl::CovShiftBackward(Umu[mu], mu,
+    	 	        Gimpl::CovShiftIdentityBackward(Umu[nu], nu)));
+
+	PlaqL=Gimpl::CovShiftIdentityBackward(Utmp, mu);
+
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
+	ComputeNxy(PlaqL, PlaqR,Nxy);
+	Fdet1_nu = Fdet1_nu+transpose(Nxy)*dJdXe_nMpInv_y;
+	
+
+	MpInvJx_nu = Cshift(MpInvJx,mu,-1);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_nu = Fdet2_nu+FdetV;
+	
+	///////////////// -ve nu /////////////////
+	//  __
+	// |  |
+	// x==          // nu polarisation -- clockwise
+
+	PlaqL=(rho)* Gimpl::CovShiftForward(Umu[mu], mu,
+		       Gimpl::CovShiftForward(Umu[nu], nu,
+			 Gimpl::CovShiftIdentityBackward(Utmp, mu)));
+
+        PlaqR = Gimpl::CovShiftIdentityForward(Umu[nu], nu);
+
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
+
+	MpInvJx_nu = Cshift(MpInvJx,nu,1);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_nu = Fdet2_nu+FdetV;
+	
+	// x==
+	// |  |
+	// |__|         // nu polarisation
+
+	PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
+ 	        Gimpl::CovShiftIdentityBackward(Utmp, mu));
+
+	PlaqR=Gimpl::CovShiftBackward(Umu[mu], mu,
+	        Gimpl::CovShiftIdentityForward(Umu[nu], nu));
+
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv_y,nu,1);
+
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
+
+	MpInvJx_nu = Cshift(MpInvJx,mu,-1);
+	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_nu = Fdet2_nu+FdetV;
+
+	/////////////////////////////////////////////////////////////////////
+	// Set up the determinant force contribution in 3x3 algebra basis
+	/////////////////////////////////////////////////////////////////////
+	InsertForce(Fdet1,Fdet1_nu,nu);
+	InsertForce(Fdet2,Fdet2_nu,nu);
+	
+	//////////////////////////////////////////////////
+	// Parallel direction terms
+	//////////////////////////////////////////////////
+
+        //     __
+	//    |  "
+	//    |__"x    // mu polarisation
+	PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
+		      Gimpl::CovShiftBackward(Umu[nu], nu,
+   		        Gimpl::CovShiftIdentityBackward(Utmp, mu)));
+
+	PlaqR=Gimpl::CovShiftIdentityBackward(Umu[nu], nu);
+	
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,-1);
+
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
+
+	MpInvJx_nu = Cshift(MpInvJx,nu,-1);
+
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_mu = Fdet2_mu+FdetV;
+
+	//  __
+	// "  |
+	// x__|          // mu polarisation
+
+	PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
+		       Gimpl::CovShiftForward(Umu[nu], nu,
+		 	 Gimpl::CovShiftIdentityBackward(Utmp, mu)));
+
+        PlaqR=Gimpl::CovShiftIdentityForward(Umu[nu], nu);
+
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
+
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
+
+	MpInvJx_nu = Cshift(MpInvJx,nu,1);
+
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_mu = Fdet2_mu+FdetV;
+	
+      }
+    }
+    RealD t5 = usecond();
+
+    // site-local Nxx term of the mu polarisation
+    Fdet1_mu = Fdet1_mu + transpose(NxxAd)*dJdXe_nMpInv;
+
+    InsertForce(Fdet1,Fdet1_mu,mu);
+    InsertForce(Fdet2,Fdet2_mu,mu);
+
+    force= (-0.5)*( Fdet1 + Fdet2);
+    RealD t1 = usecond();
+    // NOTE(review): the "level took" line is printed twice (first and last statement below).
+    std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce t3-t0 "<<t3a-t0<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce t4-t3 dJdXe_nMpInv "<<t4-t3a<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce t5-t4 mu nu loop "<<t5-t4<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce t1-t5 "<<t1-t5<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
+  }
+  // Returns the real part of the masked lattice sum of log det(1 - J*N) for one
+  // smearing level.
+  //
+  //  U   : gauge field that is the input to smearing level smr
+  //  smr : smearing level; selects direction mu = (smr/2) % Nd and the mask masks[smr]
+  //
+  // N (Ncb) is assembled from the link and its staple, J (Jac) from a truncated power
+  // series in the adjoint representation of Z = Ta(Cmu * adj(Umu)); the per-site
+  // determinant of Mab = 1 - Jac*Ncb is taken, logged, masked and summed.
+  RealD logDetJacobianLevel(const GaugeField &U,int smr)
+  {
+    GridBase* grid = U.Grid();
+    GaugeField C(grid);
+    GaugeLinkField Nb(grid);
+    GaugeLinkField Z(grid);
+    GaugeLinkField Umu(grid), Cmu(grid);
+    ColourMatrix   Tb;
+    ColourMatrix   Tc;
+    // These local typedefs repeat the class-level typedefs of the same names.
+    typedef typename SU3Adjoint::AMatrix AdjMatrix;
+    typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
+    typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField;
+    const int Ngen = SU3Adjoint::Dimension;
+    AdjMatrix TRb;
+    LatticeComplex       cplx(grid); 
+    AdjVectorField  AlgV(grid); // NOTE(review): not referenced below
+    AdjMatrixField  Mab(grid);
+    AdjMatrixField  Ncb(grid);
+    AdjMatrixField  Jac(grid);
+    AdjMatrixField  Zac(grid);
+    AdjMatrixField  mZac(grid);
+    AdjMatrixField  X(grid);
+
+    int mu= (smr/2) %Nd;
+
+    auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
+
+    //////////////////////////////////////////////////////////////////
+    // Assemble the N matrix
+    //////////////////////////////////////////////////////////////////
+    // Computes ALL the staples -- could compute one only here
+    this->StoutSmearing->BaseSmear(C, U);
+    Cmu = peekLorentz(C, mu);
+    Umu = peekLorentz(U, mu);
+    Complex ci(0,1);
+    for(int b=0;b<Ngen;b++) {
+      SU3::generator(b, Tb);
+      // Qlat Tb = 2i Tb^Grid
+      Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
+      for(int c=0;c<Ngen;c++) {
+	SU3::generator(c, Tc);
+	auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
+	PokeIndex<ColourIndex>(Ncb,tmp,c,b); 
+      }
+    }      
+
+    //////////////////////////////////////////////////////////////////
+    // Assemble Luscher exp diff map J matrix 
+    //////////////////////////////////////////////////////////////////
+    // Ta so Z lives in Lie algebra
+    Z  = Ta(Cmu * adj(Umu));
+
+    // Move Z to the Adjoint Rep == make_adjoint_representation
+    Zac = Zero();
+    for(int b=0;b<8;b++) {
+      // Adj group sets traceless antihermitian T's -- Guido, really????
+      // Is the mapping of these the same? Same structure constants
+      // Might never have been checked.
+      SU3::generator(b, Tb);         // Fund group sets traceless hermitian T's
+      SU3Adjoint::generator(b,TRb);
+      TRb=-TRb;
+      cplx = 2.0*trace(ci*Tb*Z); // my convention 1/2 delta ba
+      Zac = Zac + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
+    }
+
+    //////////////////////////////////////
+    // J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
+    //////////////////////////////////////
+    // The series is truncated after the k=11 term.
+    X=1.0; 
+    Jac = X;
+    mZac = (-1.0)*Zac; 
+    RealD kpfac = 1;
+    for(int k=1;k<12;k++){
+      X=X*mZac;
+      kpfac = kpfac /(k+1);
+      Jac = Jac + X * kpfac;
+    }
+
+    ////////////////////////////
+    // Mab
+    ////////////////////////////
+    Mab = Complex(1.0,0.0);
+    Mab = Mab - Jac * Ncb;
+
+    ////////////////////////////
+    // det
+    ////////////////////////////
+    LatticeComplex       det(grid); 
+    det = Determinant(Mab);
+
+    ////////////////////////////
+    // ln det
+    ////////////////////////////
+    LatticeComplex       ln_det(grid); 
+    ln_det = log(det);
+
+    ////////////////////////////
+    // Masked sum
+    ////////////////////////////
+    // Only sites selected by the mask contribute; the imaginary part of the sum is dropped.
+    ln_det = ln_det * mask;
+    Complex result = sum(ln_det);
+    return result.real();
+  }
+public:
+  // Sums logDetJacobianLevel over all smearing levels, from the top level down to
+  // level 0 (which acts on the thin links). Returns 0 when there are no levels.
+  RealD logDetJacobian(void)
+  {
+    RealD total = 0;
+    if (!(this->smearingLevels > 0)) return total;
+
+    double t_begin = usecond();
+
+    // levels N-1 ... 1 take the previous level's smeared field as their input
+    int lvl = this->smearingLevels - 1;
+    while (lvl > 0) {
+      total += logDetJacobianLevel(this->get_smeared_conf(lvl-1),lvl);
+      --lvl;
+    }
+    // level 0 takes the thin links as its input
+    total += logDetJacobianLevel(*(this->ThinLinks),0);
+
+    double t_end = usecond();
+    double time = (t_end - t_begin)/ 1e3;
+    std::cout << GridLogMessage << "GaugeConfigurationMasked: logDetJacobian took " << time << " ms" << std::endl;  
+    return total;
+  }
+  // Accumulates the log-det-Jacobian force of every smearing level into `force`.
+  // Starting from zero at the top level, each step (a) strips the level's links from
+  // the running force, (b) pushes it one level down with AnalyticSmearedForce,
+  // (c) multiplies the lower level's links back on and (d) adds that level's own
+  // determinant force. The result is passed through Ta() at the end.
+  // With zero smearing levels `force` is simply set to zero.
+  void logDetJacobianForce(GaugeField &force)
+  {
+    force =Zero();
+    GaugeField level_force(force.Grid());
+
+    if (!(this->smearingLevels > 0)) return; // if smearingLevels = 0 do nothing
+
+    double t_begin = usecond();
+
+    GaugeLinkField link(force.Grid());
+
+    int lvl = this->smearingLevels - 1;
+    while (lvl > 0) {
+
+      // remove U in UdSdU...
+      for (int mu = 0; mu < Nd; ++mu) {
+	link = adj(peekLorentz(this->get_smeared_conf(lvl), mu)) * peekLorentz(force, mu);
+	pokeLorentz(force, link, mu);
+      }
+
+      // Propagate existing force
+      force = this->AnalyticSmearedForce(force, this->get_smeared_conf(lvl - 1), lvl);
+
+      // Add back U in UdSdU...
+      for (int mu = 0; mu < Nd; ++mu) {
+	link = peekLorentz(this->get_smeared_conf(lvl - 1), mu) * peekLorentz(force, mu);
+	pokeLorentz(force, link, mu);
+      }
+
+      // Get this level's determinant force and sum the contributions
+      level_force = Zero();
+      logDetJacobianForceLevel(this->get_smeared_conf(lvl-1),level_force,lvl);
+      force = force + level_force;
+
+      --lvl;
+    }
+
+    // Level 0: same sequence, with the thin links as the lower level
+    for (int mu = 0; mu < Nd; ++mu) {
+      link = adj(peekLorentz(this->get_smeared_conf(0), mu)) * peekLorentz(force, mu);
+      pokeLorentz(force, link, mu);
+    }
+
+    force = this->AnalyticSmearedForce(force, *this->ThinLinks,0);
+
+    for (int mu = 0; mu < Nd; ++mu) {
+      link = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
+      pokeLorentz(force, link, mu);
+    }
+
+    level_force = Zero();
+    logDetJacobianForceLevel(*this->ThinLinks,level_force,0);
+    force = force + level_force;
+
+    force=Ta(force); // Ta
+
+    double t_end = usecond();
+    double time = (t_end - t_begin)/ 1e3;
+    std::cout << GridLogMessage << "GaugeConfigurationMasked: lnDetJacobianForce took " << time << " ms" << std::endl;  
+  }
+
+private:
+  //====================================================================
+  // Override base class here to mask it
+  // Attaches the thin links U and rebuilds every level of SmearedSet.
+  // At each level the full stout smearing of the previous level is computed, but only
+  // the links selected by that level's mask are replaced; all others are carried over.
+  // The average plaquette of each level is logged.
+  virtual void fill_smearedSet(GaugeField &U)
+  {
+    // attach the smearing routine to the field U
+    this->ThinLinks = &U;
+
+    // check the pointer is not null
+    if (this->ThinLinks == NULL)
+      std::cout << GridLogError << "[SmearedConfigurationMasked] Error in ThinLinks pointer\n";
+
+    if (!(this->smearingLevels > 0)) return;
+
+    std::cout << GridLogMessage << "[SmearedConfigurationMasked] Filling SmearedSet\n";
+    GaugeField current(this->ThinLinks->Grid());
+
+    GaugeField smeared_masked(this->ThinLinks->Grid());
+    GaugeField current_masked(this->ThinLinks->Grid());
+
+    current = *this->ThinLinks;
+    double t_begin = usecond();
+    for (int lvl = 0; lvl < this->smearingLevels; ++lvl)
+    {
+      // masked part of the freshly smeared field
+      this->StoutSmearing->smear(smeared_masked, current);
+      ApplyMask(smeared_masked,lvl);
+
+      // masked part of the unsmeared field, to be subtracted out
+      current_masked = current;
+      ApplyMask(current_masked,lvl);
+
+      // Replace only the masked portion
+      this->SmearedSet[lvl] = current-current_masked + smeared_masked;
+      current = this->SmearedSet[lvl];
+
+      // For debug purposes
+      RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(current);
+      std::cout << GridLogMessage << "[SmearedConfigurationMasked] smeared Plaq: " << plaq << std::endl;
+    }
+    double t_end = usecond();
+    double time = (t_end - t_begin)/ 1e3;
+    std::cout << GridLogMessage << "GaugeConfigurationMasked: Link smearing took " << time << " ms" << std::endl;  
+  }
+  //====================================================================
+  // Override base to add masking
+  // Propagates the force SigmaKPrime through one masked smearing level.
+  //
+  //  SigmaKPrime : force with respect to the links produced by this level
+  //  GaugeK      : links that were the input to this level
+  //  level       : smearing level, selects the mask
+  //
+  // The masked part of SigmaKPrime goes through the stout chain rule (set_iLambda and
+  // StoutSmearing->derivative); the unmasked remainder is added back unchanged.
+  // Returns the force with respect to GaugeK.
+  virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
+					  const GaugeField& GaugeK,int level) 
+  {
+    GridBase* grid = GaugeK.Grid();
+    GaugeField staples(grid), force_out(grid), lambda_all(grid);
+    GaugeField force_masked(grid);
+    GaugeField force_rest(grid);
+    GaugeLinkField lambda_mu(grid);
+    GaugeLinkField iQ(grid), exp_iQ(grid);
+    GaugeLinkField force_mu(grid);
+    GaugeLinkField link_mu(grid), staple_mu(grid);
+
+    this->StoutSmearing->BaseSmear(staples, GaugeK);
+    force_out = Zero();
+    lambda_all = Zero();
+
+    // split the incoming force into masked and unmasked pieces
+    force_masked = SigmaKPrime;
+    ApplyMask(force_masked,level);
+    force_rest = SigmaKPrime - force_masked;
+
+    // Could get away with computing only one polarisation here
+    // int mu= (smr/2) %Nd;
+    // the masked force has only one component
+    for (int mu = 0; mu < Nd; ++mu)
+    {
+      staple_mu = peekLorentz(staples, mu);
+      link_mu   = peekLorentz(GaugeK, mu);
+      force_mu  = peekLorentz(force_masked, mu);
+      iQ = Ta(staple_mu * adj(link_mu));
+      this->set_iLambda(lambda_mu, exp_iQ, iQ, force_mu, link_mu);
+      pokeLorentz(force_out, force_mu * exp_iQ + adj(staple_mu) * lambda_mu, mu);
+      pokeLorentz(lambda_all, lambda_mu, mu);
+    }
+    this->StoutSmearing->derivative(force_out, lambda_all,GaugeK);  // derivative of SmearBase
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // propagate the rest of the force as identity map, just add back
+    ////////////////////////////////////////////////////////////////////////////////////
+    force_out = force_out+force_rest;
+
+    return force_out;
+  }
+
+public:
+
+  /* Standard constructor */
+  // Builds one mask per smearing level on top of the base-class state.
+  //
+  //  _UGrid : full (non-checkerboarded) gauge grid
+  //  Nsmear : number of smearing levels; must be a multiple of 2*Nd
+  //  Stout  : stout smearing object forwarded to the base class
+  //
+  // The mask for level i is 1 on checkerboard (i%2) of direction (i/2)%Nd and 0
+  // everywhere else.
+  SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
+    : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
+  {
+    assert(Nsmear%(2*Nd)==0); // Or multiply by 8??
+
+    // was resized in base class
+    assert(this->SmearedSet.size()==Nsmear);
+    
+    GridRedBlackCartesian * UrbGrid;
+    UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
+    LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
+    LatticeComplex tmp(_UGrid);
+
+    masks.reserve(this->smearingLevels);
+    for (unsigned int i = 0; i < this->smearingLevels; ++i) {
+
+      int mu= (i/2) %Nd;
+      int cb= (i%2);
+      LatticeComplex tmpcb(UrbGrid);
+
+      // Build the mask in a local and copy it into the vector. The previous
+      // push_back(*(new ...)) copied a heap object that was never deleted,
+      // leaking one lattice field per smearing level.
+      LatticeLorentzComplex mask(_UGrid);
+      mask = Zero();
+      ////////////////////
+      // Setup the mask
+      ////////////////////
+      tmp = Zero();
+      pickCheckerboard(cb,tmpcb,one);
+      setCheckerboard(tmp,tmpcb);
+      PokeIndex<LorentzIndex>(mask,tmp, mu);
+
+      masks.push_back(mask);
+    }
+    // every tmpcb living on UrbGrid has gone out of scope by now
+    delete UrbGrid;
+  }
+  
+  // Chain-rules a force computed on the fully smeared links back to the thin links.
+  // On entry SigmaTilde holds U_smeared * dS/dU_smeared; on exit it holds the
+  // corresponding thin-link force, passed through Gimpl::projectForce.
+  // With zero smearing levels only the projection is applied.
+  virtual void smeared_force(GaugeField &SigmaTilde) 
+  {
+    if (this->smearingLevels > 0)
+    {
+      double t_begin = usecond();
+      GaugeField running = SigmaTilde; // actually = U*SigmaTilde
+      GaugeLinkField link(SigmaTilde.Grid());
+
+      // Remove U from UdSdU to get just SigmaTilde
+      for (int mu = 0; mu < Nd; ++mu)
+      {
+        link = adj(peekLorentz(this->SmearedSet[this->smearingLevels - 1], mu)) * peekLorentz(running, mu);
+        pokeLorentz(running, link, mu);
+      }
+
+      // walk down through the smeared levels ...
+      int lvl = this->smearingLevels - 1;
+      while (lvl > 0) {
+        running = this->AnalyticSmearedForce(running, this->get_smeared_conf(lvl - 1),lvl);
+        --lvl;
+      }
+      // ... and finally through level 0, whose input is the thin links
+      running = this->AnalyticSmearedForce(running, *this->ThinLinks,0);
+
+      // Add U to UdSdU
+      for (int mu = 0; mu < Nd; ++mu)
+      {
+        link = peekLorentz(*this->ThinLinks, mu) * peekLorentz(running, mu);
+        pokeLorentz(SigmaTilde, link, mu);
+      }
+
+      double t_end = usecond();
+      double time = (t_end - t_begin)/ 1e3;
+      std::cout << GridLogMessage << " GaugeConfigurationMasked: Smeared Force chain rule took " << time << " ms" << std::endl;
+
+    }  // if smearingLevels = 0 do nothing
+    SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
+  }
+
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/smearing/JacobianAction.h
+++ b/Grid/qcd/smearing/JacobianAction.h
@ -0,0 +1,87 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/smearing/JacobianAction.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+////////////////////////////////////////////////////////////////////////
+// Jacobian Action .. 
+////////////////////////////////////////////////////////////////////////
+// Action whose value is minus the log-determinant of the smearing Jacobian held by a
+// SmearedConfigurationMasked. Only the "smart configuration" entry points are usable;
+// the plain GaugeField overloads assert.
+template <class Gimpl>
+class JacobianAction : public Action<typename Gimpl::GaugeField> {
+public:  
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  // Non-owning pointer to the smearer supplying logDetJacobian and its force.
+  SmearedConfigurationMasked<Gimpl> * smearer;
+  /////////////////////////// constructors
+  explicit JacobianAction(SmearedConfigurationMasked<Gimpl> * _smearer ) : smearer(_smearer) {}
+
+  virtual std::string action_name() {return "JacobianAction";}
+
+  virtual std::string LogParameters(){
+    std::stringstream sstream;
+    sstream << GridLogMessage << "[JacobianAction] " << std::endl;
+    return sstream.str();
+  }
+
+  //////////////////////////////////
+  // Usual cases are not used
+  //////////////////////////////////
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ assert(0);}
+  // The return after the assert keeps this well-defined when NDEBUG compiles the
+  // assert away; without it control would run off the end of a non-void function.
+  virtual RealD S(const GaugeField &U) { assert(0); return 0.0; }
+  virtual void deriv(const GaugeField &U, GaugeField &dSdU) { assert(0);  }
+
+  //////////////////////////////////
+  // Functions of smart configs only
+  //////////////////////////////////
+  // No pseudofermion or other internal field to refresh.
+  virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
+  {
+    return;
+  }
+  // Returns -log det of the Jacobian; U must be the smearer this action was built with.
+  virtual RealD S(ConfigurationBase<GaugeField>& U)
+  {
+    // det M = e^{ - ( - logDetM) }
+    assert( &U == smearer );
+    return -smearer->logDetJacobian();
+  }
+  virtual RealD Sinitial(ConfigurationBase<GaugeField>& U) 
+  {
+    return S(U);
+  }
+  // Writes the smearer's log-det-Jacobian force into dSdU; U must be the smearer itself.
+  virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
+  {
+    assert( &U == smearer );
+    smearer->logDetJacobianForce(dSdU);
+  }
+
+ };
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@ -40,7 +40,9 @@ template <class Gimpl>
 class Smear_Stout : public Smear<Gimpl> {
 private:
  int OrthogDim = -1;
+public:
  const std::vector<double> SmearRho;
+private:
  // Smear<Gimpl>* ownership semantics:
  //    Smear<Gimpl>* passed in to constructor are owned by caller, so we don't delete them here
  //    Smear<Gimpl>* created within constructor need to be deleted as part of the destructor
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@ -37,13 +37,14 @@ NAMESPACE_BEGIN(Grid);
 // Make these members of an Impl class for BC's.

 namespace PeriodicBC { 
-
+  //Out(x) = Link(x)*field(x+mu)
  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
 									   int mu,
 									   const Lattice<covariant> &field)
  {
    return Link*Cshift(field,mu,1);// moves towards negative mu
  }
+  //Out(x) = Link^dag(x-mu)*field(x-mu)
  template<class covariant,class gauge> Lattice<covariant> CovShiftBackward(const Lattice<gauge> &Link, 
 									    int mu,
 									    const Lattice<covariant> &field)
@ -52,19 +53,19 @@ namespace PeriodicBC {
    tmp = adj(Link)*field;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
-
+  //Out(x) = Link^dag(x-mu)
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) 
  {
    return Cshift(adj(Link), mu, -1);
  }
-
+  //Out(x) = Link(x)
  template<class gauge> Lattice<gauge>
  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
  {
    return Link;
  }
-
+  //Out(x) = Link(x+mu)
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@ -34,6 +34,61 @@ directory

 NAMESPACE_BEGIN(Grid);

+// Site-by-site determinant of an NxN colour-matrix field.
+// Each local site matrix is copied into a double-precision Eigen matrix, its
+// determinant is evaluated with Eigen, and the value is written back in the scalar
+// type of Vec.
+template<int N, class Vec>
+Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
+{
+  typedef typename Vec::scalar_type scalar;
+  GridBase *grid=Umu.Grid();
+  Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
+  auto nsites = grid->lSites();
+  autoView(Umu_v,Umu,CpuRead);
+  autoView(ret_v,ret,CpuWrite);
+  thread_for(site,nsites,{
+    // Fetch the matrix living on this local site
+    Coordinate lcoor;
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    iScalar<iScalar<iMatrix<scalar, N> > > Us;
+    peekLocalSite(Us, Umu_v, lcoor);
+
+    // Promote every entry to ComplexD for Eigen
+    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
+    for(int row=0;row<N;row++){
+      for(int col=0;col<N;col++){
+        scalar entry = Us()()(row,col);
+        EigenU(row,col) = ComplexD(real(entry),imag(entry));
+      }
+    }
+
+    // Evaluate in double precision, store in the field's own scalar type
+    ComplexD detD = EigenU.determinant();
+    scalar det(detD.real(),detD.imag());
+    pokeLocalSite(det,ret_v,lcoor);
+  });
+  return ret;
+}
+
+// Project a field of NxN colour matrices towards SU(N), in place.
+// Step 1: Umu is replaced by ProjectOnGroup(Umu).
+// Step 2: every element (N-1,i) of the last colour row is multiplied by the complex
+//         conjugate of the site determinant; for a unitary matrix (|det| = 1) this
+//         rescales the determinant to one.
+// The row index is N-1 for both the read and the write, so the routine is valid for
+// any N and not only when N coincides with the global Nc.
+template<int N, class Vec>
+static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
+{
+  Umu      = ProjectOnGroup(Umu);
+  auto det = Determinant(Umu);
+
+  det = conjugate(det);
+
+  for(int i=0;i<N;i++){
+    auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
+    element = element * det;
+    PokeIndex<ColourIndex>(Umu,element,N-1,i);
+  }
+}
+// Lorentz-vector overload: each of the Nd link matrices of U is extracted, passed
+// through ProjectOnGroup and then through the single-matrix ProjectSUn, and written
+// back into U.
+template<int N,class Vec>
+static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<Vec, N> >,Nd> > &U)
+{
+  // Reunitarise
+  for(int mu=0;mu<Nd;mu++){
+    auto Umu = PeekIndex<LorentzIndex>(U,mu);
+    Umu      = ProjectOnGroup(Umu);
+    ProjectSUn(Umu);
+    PokeIndex<LorentzIndex>(U,Umu,mu);
+  }
+}
+
 template <int ncolour>
 class SU {
 public:
@ -741,8 +796,14 @@ public:
    typedef Lattice<vMatrixType> LatticeMatrixType;

    LatticeMatrixType Umu(out.Grid());
+    LatticeMatrixType tmp(out.Grid());
    for (int mu = 0; mu < Nd; mu++) {
-      LieRandomize(pRNG, Umu, 1.0);
+      //      LieRandomize(pRNG, Umu, 1.0);
+      //      PokeIndex<LorentzIndex>(out, Umu, mu);
+      gaussian(pRNG,Umu);
+      tmp = Ta(Umu);
+      taExp(tmp,Umu);
+      ProjectSUn(Umu);
      PokeIndex<LorentzIndex>(out, Umu, mu);
    }
  }
@ -799,12 +860,12 @@ public:
 };

 template<int N>
-LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
+Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
 {
  GridBase *grid=Umu.Grid();
  auto lvol = grid->lSites();
-  LatticeComplexD ret(grid);
-
+  Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
+  
  autoView(Umu_v,Umu,CpuRead);
  autoView(ret_v,ret,CpuWrite);
  thread_for(site,lvol,{
@ -812,42 +873,21 @@ LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N>
    Coordinate lcoor;
    grid->LocalIndexToLocalCoor(site, lcoor);
    iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
+    iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
    peekLocalSite(Us, Umu_v, lcoor);
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	EigenU(i,j) = Us()()(i,j);
      }}
-    ComplexD det = EigenU.determinant();
-    pokeLocalSite(det,ret_v,lcoor);
+    Eigen::MatrixXcd EigenUinv = EigenU.inverse();
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	Ui()()(i,j) = EigenUinv(i,j);
+      }}
+    pokeLocalSite(Ui,ret_v,lcoor);
  });
  return ret;
 }
-template<int N>
-static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
-{
-  Umu      = ProjectOnGroup(Umu);
-  auto det = Determinant(Umu);
-
-  det = conjugate(det);
-
-  for(int i=0;i<N;i++){
-    auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
-    element = element * det;
-    PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
-  }
-}
-template<int N>
-static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<vComplexD, N> >,Nd> > &U)
-{
-  GridBase *grid=U.Grid();
-  // Reunitarise
-  for(int mu=0;mu<Nd;mu++){
-    auto Umu = PeekIndex<LorentzIndex>(U,mu);
-    Umu      = ProjectOnGroup(Umu);
-    ProjectSUn(Umu);
-    PokeIndex<LorentzIndex>(U,Umu,mu);
-  }
-}
 // Explicit specialisation for SU(3).
 // Explicit specialisation for SU(3).
 static void
--- a/Grid/qcd/utils/SUnAdjoint.h
+++ b/Grid/qcd/utils/SUnAdjoint.h
@ -51,6 +51,7 @@ public:
  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;

+  typedef Lattice<iScalar<iScalar<iVector<vComplex, Dimension> > > >  LatticeAdjVector;

  template <class cplx>
  static void generator(int Index, iSUnAdjointMatrix<cplx> &iAdjTa) {
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -290,7 +290,7 @@ public:
  }
 */
  //////////////////////////////////////////////////
-  // the sum over all staples on each site
+  // the sum over all nu-oriented staples for nu != mu on each site
  //////////////////////////////////////////////////
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

@ -300,6 +300,10 @@ public:
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
+    Staple(staple, U, mu);
+  }
+
+  static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
    staple = Zero();

    for (int nu = 0; nu < Nd; nu++) {
@ -335,6 +339,202 @@ public:
    }
  }

+  /////////////
+  //Staples for every direction: staple[mu] is produced by Staple(staple[mu], U, mu),
+  //i.e. summed over nu != mu
+  //staple: output staples, one per mu; must already hold Nd entries
+  //U: link array, one field per direction; must hold Nd entries
+  /////////////
+  static void StapleAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U) {
+    assert(staple.size() == Nd);
+    assert(U.size() == Nd);
+    for(int dir=0; dir!=Nd; ++dir){
+      GaugeMat &out = staple[dir];
+      Staple(out, U, dir);
+    }
+  }
+
+
+  //A workspace class allowing reuse of the stencil.
+  //Derived classes provide the list of shifts and the padding depth they require.
+  //The GeneralLocalStencil is built on the first call to getStencil and rebuilt whenever
+  //the padded grid of the PaddedCell passed in differs from the grid of the cached stencil.
+  class WilsonLoopPaddedStencilWorkspace{
+    std::unique_ptr<GeneralLocalStencil> stencil;
+    size_t nshift = 0; //number of shifts of the cached stencil; zero until a stencil has been generated
+
+    //Build (or rebuild) the stencil on padded_grid from the shifts returned by getShifts()
+    void generateStencil(GridBase* padded_grid){
+      double t0 = usecond();
+      
+      //Generate shift arrays
+      std::vector<Coordinate> shifts = this->getShifts();
+      nshift = shifts.size();
+      
+      double t1 = usecond();
+      //Generate local stencil
+      stencil.reset(new GeneralLocalStencil(padded_grid,shifts));
+      double t2 = usecond();
+      std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl;
+    }
+  public:
+    //Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly
+    //The cell must be padded at least as deeply as paddingDepth() demands
+    const GeneralLocalStencil & getStencil(const PaddedCell &pcell){
+      assert(pcell.depth >= this->paddingDepth());
+      if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back());
+      return *stencil;
+    }
+    //Number of shifts in the most recently generated stencil (0 if none has been generated yet)
+    size_t Nshift() const{ return nshift; }
+    
+    virtual std::vector<Coordinate> getShifts() const = 0; //the stencil offsets, in the order the kernels consume them
+    virtual int paddingDepth() const = 0; //padding depth required
+    
+    virtual ~WilsonLoopPaddedStencilWorkspace(){}
+  };
+
+  //This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces
+  //The PaddedCell is created on first use with the largest padding depth requested by any
+  //registered stencil workspace, and recreated when a different unpadded grid is supplied.
+  //Registered stencil workspaces are owned by this object and released with it.
+  class WilsonLoopPaddedWorkspace{
+    std::vector<std::unique_ptr<WilsonLoopPaddedStencilWorkspace> > stencil_wk;
+    std::unique_ptr<PaddedCell> pcell;
+
+    //(Re)create the padded cell for unpadded_grid; at least one stencil must have been added
+    void generatePcell(GridBase* unpadded_grid){
+      assert(stencil_wk.size());
+      GridCartesian* cartesian_grid = dynamic_cast<GridCartesian*>(unpadded_grid);
+      assert(cartesian_grid != nullptr); //a failed cast would otherwise be handed to PaddedCell as a null grid
+      int max_depth = 0;
+      for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth());
+      
+      pcell.reset(new PaddedCell(max_depth, cartesian_grid));
+    }
+    
+  public:
+    //Add a stencil definition. This should be done before the first call to retrieve a stencil object.
+    //Takes ownership of the pointer
+    void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){
+      assert(!pcell);
+      assert(stencil != nullptr);
+      //Wrap before inserting so the object is released even if the vector fails to grow
+      std::unique_ptr<WilsonLoopPaddedStencilWorkspace> owned(stencil);
+      stencil_wk.push_back(std::move(owned));
+    }
+
+    //Stencil number stencil_idx (in order of addStencil calls) for the padded version of unpadded_grid
+    const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){
+      assert(stencil_idx < stencil_wk.size());
+      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
+      return stencil_wk[stencil_idx]->getStencil(*pcell);
+    }      
+    //The shared padded cell for unpadded_grid
+    const PaddedCell & getPaddedCell(GridBase* unpadded_grid){
+      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
+      return *pcell;
+    }
+  };
+
+  //Stencil workspace for the plaquette staples of all directions
+  class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
+  public:
+    //Six shifts for every ordered pair (mu, nu != mu), mu running slowest:
+    //three links for the first staple term followed by three for the second
+    std::vector<Coordinate> getShifts() const override{
+      std::vector<Coordinate> shifts;
+      for(int mu=0;mu<Nd;mu++){
+        for(int nu=0;nu<Nd;nu++){
+          if(nu == mu) continue;
+
+          //Offset of mushift sites along mu and nushift sites along nu
+          auto genShift = [&](int mushift,int nushift){
+            Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
+          };
+
+          //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
+          shifts.push_back(genShift(0,0));
+          shifts.push_back(genShift(0,+1));
+          shifts.push_back(genShift(+1,0));
+
+          //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
+          shifts.push_back(genShift(0,-1));
+          shifts.push_back(genShift(0,-1));
+          shifts.push_back(genShift(+1,-1));
+        }
+      }
+      return shifts;
+    }
+
+    //One site of halo is enough: no shift above exceeds one site in any direction
+    int paddingDepth() const override{ return 1; }
+  }; 
+
+  //Padded cell implementation of the staple method for all mu, summed over nu != mu
+  //Convenience form: a throwaway workspace builds the stencil for this one call
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
+    StaplePaddedAllWorkspace local_wk;
+    const GeneralLocalStencil &gStencil = local_wk.getStencil(Cell);
+    StaplePaddedAll(staple,U_padded,Cell,gStencil);
+  }
+  
+  //Padded cell implementation of the staple method for all mu, summed over nu != mu
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  //gStencil: the precomputed generalized local stencil for the staple
+  //
+  //The kernel walks the stencil entries strictly in sequence: for each mu it starts at
+  //entry mu*(npoints/Nd) and consumes six entries for every nu != mu, in the order
+  //produced by StaplePaddedAllWorkspace::getShifts. A stencil with a different layout
+  //is not detected here.
+  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
+    double t0 = usecond();
+    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
+    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
+    assert(Cell.depth >= 1);
+    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
+
+    //Number of stencil entries belonging to a single mu
+    int shift_mu_off = gStencil._npoints/Nd;
+    
+    //Open views to padded gauge links and keep open over mu loop
+    //The views are collected in a host array which is then copied into a device
+    //allocation, so that the kernel can pick the view of any direction by index
+    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+    size_t vsize = Nd*sizeof(GaugeViewType);
+    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
+    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
+    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
+    
+    //Staple on the padded grid; reused for every mu
+    GaugeMat gStaple(ggrid);
+
+    //Index of the first stencil entry of the current mu
+    int outer_off = 0;
+    for(int mu=0;mu<Nd;mu++){
+      { //view scope
+	autoView( gStaple_v , gStaple, AcceleratorWrite);
+	auto gStencil_v = gStencil.View();
+	
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
+	    stencil_ss = Zero();
+	    int off = outer_off;
+	    
+	    for(int nu=0;nu<Nd;nu++){
+	      if(nu != mu){	  
+		//First term: adj(U_nu), adj(U_mu), U_nu at the first three stencil entries
+		GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
+		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+      
+		stencil_ss = stencil_ss + U2 * U1 * U0;
+
+		//Second term: U_nu, adj(U_mu), adj(U_nu) at the next three stencil entries
+		e = gStencil_v.GetEntry(off++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(off++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+		stencil_ss = stencil_ss + U2 * U1 * U0;
+	      }
+	    }
+		
+	    coalescedWrite(gStaple_v[ss],stencil_ss);
+	  }
+	  );
+      } //ensure views are all closed!
+      
+      //Cell.Extract presumably strips the padding to give a field on the unpadded grid -- confirm in PaddedCell
+      staple[mu] = Cell.Extract(gStaple);
+      outer_off += shift_mu_off;
+    }//mu loop
+
+    //Close the link views and release the host and device view arrays
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
+    free(Ug_dirs_v_host);
+    acceleratorFreeDevice(Ug_dirs_v);
+    
+    double t1=usecond();
+    
+    std::cout << GridLogPerformance << "StaplePaddedAll timing:" << (t1-t0)/1000 << "ms" << std::endl;   
+  }
+
+   
  //////////////////////////////////////////////////
  // the sum over all staples on each site in direction mu,nu, upper part
  //////////////////////////////////////////////////
@ -707,18 +907,14 @@ public:
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
-    U2 = U * Cshift(U, mu, 1);
+    U2 = U * Gimpl::CshiftLink(U, mu, 1);
  }

  ////////////////////////////////////////////////////////////////////////////
-  // Hop by two optimisation strategy does not work nicely with Gparity. (could
-  // do,
-  // but need to track two deep where cross boundary and apply a conjugation).
-  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do
-  // so .
+  // Hop by two optimisation strategy. Use RectStapleDouble to obtain 'U2'
  ////////////////////////////////////////////////////////////////////////////
-  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
-                                  std::vector<GaugeMat> &U, int mu) {
+  static void RectStapleOptimised(GaugeMat &Stap, const std::vector<GaugeMat> &U2,
+                                  const std::vector<GaugeMat> &U, int mu) {

    Stap = Zero();

@ -732,9 +928,9 @@ public:

        // Up staple    ___ ___
        //             |       |
-        tmp = Cshift(adj(U[nu]), nu, -1);
+        tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
        tmp = adj(U2[mu]) * tmp;
-        tmp = Cshift(tmp, mu, -2);
+        tmp = Gimpl::CshiftLink(tmp, mu, -2);

        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);

@ -742,14 +938,14 @@ public:
        //             |___ ___|
        //
        tmp = adj(U2[mu]) * U[nu];
-        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));
+        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));

        //              ___ ___
        //             |    ___|
        //             |___ ___|
        //

-        Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
+        Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);

        //              ___ ___
        //             |___    |
@ -758,7 +954,7 @@ public:

        //  tmp= Staple2x1* Cshift(U[mu],mu,-2);
        //  Stap+= Cshift(tmp,mu,1) ;
-        Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);
+        Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
        ;

        //       --
@ -766,10 +962,10 @@ public:
        //
        //      |  |

-        tmp = Cshift(adj(U2[nu]), nu, -2);
+        tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
-        tmp = U2[nu] * Cshift(tmp, nu, 2);
-        Stap += Cshift(tmp, mu, 1);
+        tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);

        //      |  |
        //
@ -778,25 +974,12 @@ public:

        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
        tmp = adj(U2[nu]) * tmp;
-        tmp = Cshift(tmp, nu, -2);
-        Stap += Cshift(tmp, mu, 1);
+        tmp = Gimpl::CshiftLink(tmp, nu, -2);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);
      }
    }
  }

-  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
-    RectStapleUnoptimised(Stap, Umu, mu);
-  }
-  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
-                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
-                         int mu) {
-    if (Gimpl::isPeriodicGaugeField()) {
-      RectStapleOptimised(Stap, U2, U, mu);
-    } else {
-      RectStapleUnoptimised(Stap, Umu, mu);
-    }
-  }
-
  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
                                    int mu) {
    GridBase *grid = Umu.Grid();
@ -895,6 +1078,288 @@ public:
    }
  }

+  //Rectangle staple for direction mu computed from the full gauge field Umu;
+  //forwards to RectStapleUnoptimised
+  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
+    RectStapleUnoptimised(Stap, Umu, mu);
+  }
+  //Rectangle staple for direction mu from pre-split link fields; forwards to
+  //RectStapleOptimised(Stap, U2, U, mu).
+  //Umu is not used by this overload.
+  //U2: the doubled links, to be obtained with RectStapleDouble
+  //U: the gauge links in each direction
+  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
+                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
+                         int mu) {
+    RectStapleOptimised(Stap, U2, U, mu);
+  }
+  //////////////////////////////////////////////////////
+  //Rectangular staples for every direction
+  //Stap : output staples, one per direction; must already hold Nd entries
+  //U: gauge links, one field per direction; must hold Nd entries
+  /////////////////////////////////////////////////////
+  static void RectStapleAll(std::vector<GaugeMat> &Stap, const std::vector<GaugeMat> &U){
+    assert(Stap.size() == Nd);
+    assert(U.size() == Nd);
+
+    //First pass: doubled links for all directions, since every staple needs all of them
+    GridBase *grid = U[0].Grid();
+    std::vector<GaugeMat> doubled(Nd,grid);
+    for(int dir=0;dir<Nd;dir++){
+      RectStapleDouble(doubled[dir], U[dir], dir);
+    }
+
+    //Second pass: the staples themselves
+    for(int dir=0;dir<Nd;dir++){
+      RectStapleOptimised(Stap[dir], doubled, U, dir);
+    }
+  }
+
+  //Stencil workspace for the rectangular staples of all directions
+  class RectStaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
+  public:
+    //Thirty shifts for every ordered pair (mu, nu != mu), mu running slowest:
+    //six five-link terms, listed in the order in which the links are read
+    std::vector<Coordinate> getShifts() const override{
+      //{shift along mu, shift along nu} of each link
+      static const int link_shifts[30][2] = {
+        //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+        { 0, 0}, { 0,+1}, {+1,+1}, {+2, 0}, {+1, 0},
+        //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+        { 0,-1}, { 0,-1}, {+1,-1}, {+2,-1}, {+1, 0},
+        //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+        {-1, 0}, {-1,-1}, {-1,-1}, { 0,-1}, {+1,-1},
+        //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+        {-1, 0}, {-1, 0}, {-1,+1}, { 0,+1}, {+1, 0},
+        //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+        { 0, 0}, { 0,+1}, { 0,+2}, {+1,+1}, {+1, 0},
+        //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+        { 0,-1}, { 0,-2}, { 0,-2}, {+1,-2}, {+1,-1}
+      };
+
+      std::vector<Coordinate> shifts;
+      for (int mu = 0; mu < Nd; mu++){
+        for (int nu = 0; nu < Nd; nu++) {
+          if (nu == mu) continue;
+          for (const auto &ms : link_shifts) {
+            Coordinate out(Nd,0);
+            out[mu] = ms[0];
+            out[nu] = ms[1];
+            shifts.push_back(out);
+          }
+        }
+      }
+      return shifts;
+    }
+
+    //Two sites of halo: the largest shift above is two sites in one direction
+    int paddingDepth() const override{ return 2; }
+  }; 
+
+  //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
+  //Convenience form: a throwaway workspace builds the stencil for this one call
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
+    RectStaplePaddedAllWorkspace local_wk;
+    const GeneralLocalStencil &gStencil = local_wk.getStencil(Cell);
+    RectStaplePaddedAll(staple,U_padded,Cell,gStencil);
+  }
+  
+  //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  //gStencil: the stencil
+  //
+  //The kernel walks the stencil entries strictly in sequence: for each mu it starts at
+  //entry mu*(npoints/Nd) and consumes thirty entries (six terms of five links) for every
+  //nu != mu, in the order produced by RectStaplePaddedAllWorkspace::getShifts. A stencil
+  //with a different layout is not detected here.
+  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
+    double t0 = usecond();
+    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
+    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
+    assert(Cell.depth >= 2);
+    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
+
+    //Number of stencil entries belonging to a single mu
+    size_t nshift = gStencil._npoints;
+    int mu_off_delta = nshift / Nd;
+    
+    //Open views to padded gauge links and keep open over mu loop
+    //The views are collected in a host array which is then copied into a device
+    //allocation, so that the kernel can pick the view of any direction by index
+    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+    size_t vsize = Nd*sizeof(GaugeViewType);
+    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
+    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
+    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
+
+    GaugeMat gStaple(ggrid); //temp staple object on padded grid
+
+    //Index of the first stencil entry of the current mu
+    int offset = 0;
+    for(int mu=0; mu<Nd; mu++){
+
+      { //view scope
+	autoView( gStaple_v , gStaple, AcceleratorWrite);
+	auto gStencil_v = gStencil.View();
+
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
+	    stencil_ss = Zero();
+	    int s=offset;
+	    for(int nu=0;nu<Nd;nu++){
+	      if(nu != mu){
+		//Each term below reads five links U0..U4 from consecutive stencil entries
+		//and accumulates the ordered product U4*U3*U2*U1*U0
+		//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+		GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
+		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	    
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+		//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+	      }
+	    }
+	    coalescedWrite(gStaple_v[ss],stencil_ss);
+	  }
+	  );
+	//Advance to the first stencil entry of the next mu
+	offset += mu_off_delta;
+      }//kernel/view scope
+
+      //Cell.Extract presumably strips the padding to give a field on the unpadded grid -- confirm in PaddedCell
+      staple[mu] = Cell.Extract(gStaple);    
+    }//mu loop
+  
+    //Close the link views and release the host and device view arrays
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
+    free(Ug_dirs_v_host);
+    acceleratorFreeDevice(Ug_dirs_v);
+    
+    double t1 = usecond();
+    
+    std::cout << GridLogPerformance << "RectStaplePaddedAll timings:" << (t1-t0)/1000 << "ms" << std::endl;   
+  }
+
+  //A workspace for reusing the PaddedCell and GeneralLocalStencil objects
+  //The order of registration fixes the stencil indices used with getStencil:
+  //  index 0 -> StaplePaddedAllWorkspace     (plaquette staples)
+  //  index 1 -> RectStaplePaddedAllWorkspace (rectangular staples)
+  //The base class takes ownership of the objects allocated here.
+  class StapleAndRectStapleAllWorkspace: public WilsonLoopPaddedWorkspace{
+  public:
+    StapleAndRectStapleAllWorkspace(){
+      this->addStencil(new StaplePaddedAllWorkspace);
+      this->addStencil(new RectStaplePaddedAllWorkspace);
+    }
+  };     
+    
+  //////////////////////////////////////////////////////
+  //Compute the 1x1 and 1x2 staples for all orientations
+  //Stap : Array of staples (Nd)
+  //RectStap: Array of rectangular staples (Nd)
+  //U: Gauge links in each direction (Nd)
+  //
+  //Convenience form: a fresh workspace is constructed on every call, so nothing is
+  //carried over between calls. Pass a workspace explicitly to the other overload to reuse it.
+  /////////////////////////////////////////////////////
+  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U){
+    StapleAndRectStapleAllWorkspace wk;
+    StapleAndRectStapleAll(Stap,RectStap,U,wk);
+  }
+  
+  //////////////////////////////////////////////////////
+  //Compute the 1x1 and 1x2 staples for all orientations
+  //Stap : Array of staples (Nd)
+  //RectStap: Array of rectangular staples (Nd)
+  //U: Gauge links in each direction (Nd); the grid of U[0] must be a GridCartesian
+  //wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse
+  //
+  //The links are padded once and the padded copies are shared by both staple kernels.
+  //Stencil 0 of the workspace is used for the 1x1 staples and stencil 1 for the 1x2 staples.
+  /////////////////////////////////////////////////////
+  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U, StapleAndRectStapleAllWorkspace &wk){
+#if 0
+    StapleAll(Stap, U);
+    RectStapleAll(RectStap, U);
+#else
+    double t0 = usecond();
+
+    assert(U.size() == Nd); //checked before U[0] is touched
+    GridCartesian* unpadded_grid = dynamic_cast<GridCartesian*>(U[0].Grid());
+    assert(unpadded_grid != nullptr); //a failed cast would otherwise reach the workspace as a null grid
+    const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid);
+
+    //Pad every link field out to the depth of the shared cell
+    CshiftImplGauge<Gimpl> cshift_impl;
+    std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
+    for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
+    double t1 = usecond();
+    StaplePaddedAll(Stap, U_pad, Ghost, wk.getStencil(0,unpadded_grid) );
+    double t2 = usecond();
+    RectStaplePaddedAll(RectStap, U_pad, Ghost, wk.getStencil(1,unpadded_grid));
+    double t3 = usecond();
+    std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
+#endif
+  }
+
  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@ -79,60 +79,60 @@ public:
    this->_entries.resize(npoints* osites);
    this->_entries_p = &_entries[0];

+    thread_for(site, osites, {
+	Coordinate Coor;
+	Coordinate NbrCoor;

-    Coordinate Coor;
-    Coordinate NbrCoor;
-    for(Integer site=0;site<osites;site++){
-      for(Integer ii=0;ii<npoints;ii++){
-	Integer lex = site*npoints+ii;
-	GeneralStencilEntry SE;
-	////////////////////////////////////////////////
-	// Outer index of neighbour Offset calculation
-	////////////////////////////////////////////////
-	grid->oCoorFromOindex(Coor,site);
-	for(int d=0;d<Coor.size();d++){
-	  int rd = grid->_rdimensions[d];
-	  NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
+	for(Integer ii=0;ii<npoints;ii++){
+	  Integer lex = site*npoints+ii;
+	  GeneralStencilEntry SE;
+	  ////////////////////////////////////////////////
+	  // Outer index of neighbour Offset calculation
+	  ////////////////////////////////////////////////
+	  grid->oCoorFromOindex(Coor,site);
+	  for(int d=0;d<Coor.size();d++){
+	    int rd = grid->_rdimensions[d];
+	    NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
+	  }
+	  SE._offset      = grid->oIndexReduced(NbrCoor);
+
+	  ////////////////////////////////////////////////
+	  // Inner index permute calculation
+	  // Simpler version using icoor calculation
+	  ////////////////////////////////////////////////
+	  SE._permute =0;
+	  for(int d=0;d<Coor.size();d++){
+
+	    int fd = grid->_fdimensions[d];
+	    int rd = grid->_rdimensions[d];
+	    int ly = grid->_simd_layout[d];
+
+	    assert((ly==1)||(ly==2));
+
+	    int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
+	    int x = Coor[d];                // x in [0... rd-1] as an oSite 
+
+	    int permute_dim  = grid->PermuteDim(d);
+	    int permute_slice=0;
+	    if(permute_dim){    
+	      int  num = shift%rd; // Slice within dest osite cell of slice zero
+	      int wrap = shift/rd; // Number of osite local volume cells crossed through
+	      // x+num < rd dictates whether we are in same permute state as slice 0
+	      if ( x< rd-num ) permute_slice=wrap;
+	      else             permute_slice=(wrap+1)%ly;
+	    }
+	    if ( permute_slice ) {
+	      int ptype       =grid->PermuteType(d);
+	      uint8_t mask    =0x1<<ptype;
+	      SE._permute    |= mask;
+	    }
+	  }	
+	  ////////////////////////////////////////////////
+	  // Store in look up table
+	  ////////////////////////////////////////////////
+	  this->_entries[lex] = SE;
 	}
-	SE._offset      = grid->oIndexReduced(NbrCoor);
-
-	////////////////////////////////////////////////
-	// Inner index permute calculation
-	// Simpler version using icoor calculation
-	////////////////////////////////////////////////
-	SE._permute =0;
-	for(int d=0;d<Coor.size();d++){
-
-	  int fd = grid->_fdimensions[d];
-	  int rd = grid->_rdimensions[d];
-	  int ly = grid->_simd_layout[d];
-
-	  assert((ly==1)||(ly==2));
-
-	  int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
-	  int x = Coor[d];                // x in [0... rd-1] as an oSite 
-
-	  int permute_dim  = grid->PermuteDim(d);
-	  int permute_slice=0;
-	  if(permute_dim){    
-	    int  num = shift%rd; // Slice within dest osite cell of slice zero
-	    int wrap = shift/rd; // Number of osite local volume cells crossed through
-                                  // x+num < rd dictates whether we are in same permute state as slice 0
-	    if ( x< rd-num ) permute_slice=wrap;
-	    else             permute_slice=(wrap+1)%ly;
-	  }
-	  if ( permute_slice ) {
-	    int ptype       =grid->PermuteType(d);
-	    uint8_t mask    =grid->Nsimd() >> (ptype + 1);		
-	    SE._permute    |= mask;
-	  }
-	}	
-	////////////////////////////////////////////////
-	// Store in look up table
-	////////////////////////////////////////////////
-	this->_entries[lex] = SE;
-      }
-    }      
+      });
  }
  
 };
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -32,6 +32,7 @@

 #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate
 #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
+#include <Grid/stencil/GeneralLocalStencil.h>

 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
@ -339,8 +340,8 @@ public:
  // Vectors that live on the symmetric heap in case of SHMEM
  // These are used; either SHM objects or refs to the above symmetric heap vectors
  // depending on comms target
-  Vector<cobj *> u_simd_send_buf;
-  Vector<cobj *> u_simd_recv_buf;
+  std::vector<cobj *> u_simd_send_buf;
+  std::vector<cobj *> u_simd_recv_buf;

  int u_comm_offset;
  int _unified_buffer_size;
@ -348,7 +349,7 @@ public:
  ////////////////////////////////////////
  // Stencil query
  ////////////////////////////////////////
-#ifdef SHM_FAST_PATH
+#if 1
  inline int SameNode(int point) {

    int dimension    = this->_directions[point];
@ -434,7 +435,6 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    accelerator_barrier();
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@ -452,7 +452,6 @@ public:
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
    acceleratorCopySynchronise();
-    // Everyone agrees we are all done
    _grid->StencilBarrier(); 
  }
  ////////////////////////////////////////////////////////////////////////
@ -541,6 +540,7 @@ public:
      compress.Point(point);
      HaloGatherDir(source,compress,point,face_idx);
    }
+    accelerator_barrier();
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);

@ -666,11 +666,9 @@ public:
    for(int i=0;i<mm.size();i++){
      decompressor::MergeFace(decompress,mm[i]);
    }
-    if ( mm.size() )    acceleratorFenceComputeStream();
    for(int i=0;i<dd.size();i++){
      decompressor::DecompressFace(decompress,dd[i]);
    }
-    if ( dd.size() )    acceleratorFenceComputeStream();
  }
  ////////////////////////////////////////
  // Set up routines
@ -708,6 +706,7 @@ public:
 	}
      }
    }
+    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@ -73,6 +73,16 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
    return vec;
  }
 }
+//'perm_mask' is read bit by bit: bit d set => apply permute level d, for d in [0,nd); 'lane' is unused here
+template<class vobj> accelerator_inline
+vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
+{
+  vobj result = vec;
+  for (int level=0;level<nd;level++) // each selected level permutes the running result
+    if (perm_mask & (0x1 << level)) { vobj scratch = result; permute(result,scratch,level); }
+  return result;
+}
+
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 {
@ -83,7 +93,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 {
  vstream(vec, extracted);
 }
-#else
+#else //==GRID_SIMT


 //#ifndef GRID_SYCL
@ -166,6 +176,14 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,
  return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
+typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
+{
+  int src_lane = lane; // lane to extract once every level selected in perm_mask has been applied
+  for (int level=0;level<nd;level++)
+    if (perm_mask & (0x1 << level)) src_lane ^= (vobj::Nsimd() >> (level + 1));
+  return extractLane(src_lane,vec);
+}
+template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  insertLane(lane,vec,extracted);
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c


 // Specialisation: Cayley-Hamilton exponential for SU(3)
-#ifndef GRID_ACCELERATED
+#if 0
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
 accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
 {
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -526,7 +526,7 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 //////////////////////////////////////////////

 #ifdef GRID_SYCL
-inline void acceleratorFenceComputeStream(void){ theGridAccelerator->submit_barrier();};
+inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
 #else
 // Ordering within a stream guaranteed on Nvidia & AMD
 inline void acceleratorFenceComputeStream(void){ };
--- a/HMC/FTHMC2p1f.cc
+++ b/HMC/FTHMC2p1f.cc
@ -0,0 +1,224 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
+#include <Grid/qcd/smearing/JacobianAction.h>
+
+using namespace Grid;
+
+int main(int argc, char **argv)
+{
+  std::cout << std::setprecision(12);
+  // FT HMC driver: EOFA strange + two-flavour light quotients, smearing Jacobian action, Iwasaki gauge action
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  // Typedefs to simplify notation
+  typedef WilsonImplR FermionImplPolicy;
+  typedef MobiusFermionD FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  typedef Grid::XmlReader       Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  MD.name    = std::string("Leap Frog");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = 12;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 0;
+  HMCparams.Trajectories     = 200;
+  HMCparams.NoMetropolisUntil=  20;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  HMCparams.StartingType     =std::string("HotStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
+  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.saveInterval  = 1;
+  CPparams.saveSmeared   = true;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.04;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD b   = 1.0; // Scale factor two -- NOTE(review): looks stale for b=1.0, c=0.0; confirm
+  RealD c   = 0.0;
+
+  OneFlavourRationalParams OFRp;
+  OFRp.lo       = 1.0e-2;
+  OFRp.hi       = 64;
+  OFRp.MaxIter  = 10000;
+  OFRp.tolerance= 1.0e-10;
+  OFRp.degree   = 14;
+  OFRp.precision= 40;
+
+  std::vector<Real> hasenbusch({ 0.1 });
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+  LatticeGaugeField Uhot(GridPtr);
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+
+  double StoppingCondition = 1e-10;
+  double MaxCGIterations = 30000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+
+  bool ApplySmearing = true;
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(2);
+  ActionLevel<HMCWrapper::Field> Level3(4);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+
+  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
+    EOFA(Strange_Op_L, Strange_Op_R,
+         CG,
+         CG, CG,
+         CG, CG,
+         OFRp, false);
+
+  EOFA.is_smeared = ApplySmearing;
+  Level1.push_back(&EOFA);
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+  // Mass chain: quotient h pairs numerator mass light_num[h] with denominator mass light_den[h]
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]);
+    light_num.push_back(hasenbusch[h]);
+  }
+  light_num.push_back(pv_mass);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
+    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+  }
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    Quotients[h]->is_smeared = ApplySmearing;
+    Level1.push_back(Quotients[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // lnDetJacobianAction
+  /////////////////////////////////////////////////////////////
+  double rho = 0.1;  // smearing parameter
+  int Nsmear = 1;    // number of smearing levels
+  int Nstep  = 8*Nsmear;    // number of smearing steps - must be multiple of 2Nd
+  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
+  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
+  if( ApplySmearing ) Level2.push_back(&Jacobian);
+  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
+
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  //  GaugeAction.is_smeared = ApplySmearing;
+  GaugeAction.is_smeared = true;
+  Level3.push_back(&GaugeAction);
+
+  std::cout << GridLogMessage << " ************************************************"<< std::endl;
+  std::cout << GridLogMessage << " Action complete "<< std::endl; // fermion actions are in Level1, so the old "NO FERMIONS" text was stale
+  std::cout << GridLogMessage << " ************************************************"<< std::endl;
+  std::cout << GridLogMessage <<  std::endl;
+  std::cout << GridLogMessage <<  std::endl;
+
+
+  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+
+  TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
@ -227,7 +227,7 @@ int main(int argc, char **argv) {
  //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

-  int SP_iters=10000;
+  int SP_iters=9000;
  
  RationalActionParams OFRp; // Up/down
  OFRp.lo       = 6.0e-5;
@ -362,12 +362,12 @@ int main(int argc, char **argv) {

  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.25;
+  SFRp.lo       = 0.1;
  SFRp.hi       = 25.0;
  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-5;
+  SFRp.tolerance= 1.0e-8;
  SFRp.mdtolerance= 2.0e-4;
-  SFRp.degree   = 8;
+  SFRp.degree   = 12;
  SFRp.precision= 50;
  
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@ -146,6 +146,8 @@ NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  using namespace Grid;

+  std::cout << " Grid Initialise "<<std::endl;
+  
  Grid_init(&argc, &argv);

  CartesianCommunicator::BarrierWorld();
@ -170,24 +172,24 @@ int main(int argc, char **argv) {
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
-  MD.name    = std::string("Force Gradient");
-  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
-  // MD.name    = std::string("MinimumNorm2");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  MD.name    = std::string("MinimumNorm2");
  // TrajL = 2
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
-  MD.MDsteps =  12;
+  MD.MDsteps =  14;
  MD.trajL   = 0.5;

  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
-  HMCparams.Trajectories     = 1;
+  HMCparams.Trajectories     = 20;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("ColdStart");
-  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);

@ -223,7 +225,7 @@ int main(int argc, char **argv) {
  Real pv_mass      = 1.0;
  //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

  auto GridPtr   = TheHMC.Resources.GetCartesian();
@ -244,11 +246,6 @@ int main(int argc, char **argv) {
  Coordinate shm;

  GlobalSharedMemory::GetShmDims(mpi,shm);
-  
-  Coordinate CommDim(Nd);
-  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-
-  Coordinate NonDirichlet(Nd+1,0);

  //////////////////////////
  // Fermion Grids
@ -277,15 +274,13 @@ int main(int argc, char **argv) {
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  FermionActionF::ImplParams ParamsF(boundary);
-  Params.dirichlet=NonDirichlet;
-  ParamsF.dirichlet=NonDirichlet;

  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-8;
-  double MDStoppingCondition = 1e-7;
-  double MDStoppingConditionLoose = 1e-7;
-  double MDStoppingConditionStrange = 1e-7;
+  double StoppingCondition = 1e-9;
+  double MDStoppingCondition = 1e-8;
+  double MDStoppingConditionLoose = 1e-8;
+  double MDStoppingConditionStrange = 1e-8;
  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
@ -305,12 +300,12 @@ int main(int argc, char **argv) {

  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.25;
-  SFRp.hi       = 25.0;
+  SFRp.lo       = 0.1;
+  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-5;
-  SFRp.mdtolerance= 2.0e-4;
-  SFRp.degree   = 8;
+  SFRp.tolerance= 1.0e-8;
+  SFRp.mdtolerance= 2.0e-6;
+  SFRp.degree   = 10;
  SFRp.precision= 50;
  
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
@ -370,19 +365,17 @@ int main(int argc, char **argv) {
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
-  std::vector<int> dirichlet_den;
-  std::vector<int> dirichlet_num;

  int n_hasenbusch = hasenbusch.size();
-  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
+  light_den.push_back(light_mass); 
  for(int h=0;h<n_hasenbusch;h++){
-    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(0);
+    light_den.push_back(hasenbusch[h]);
  }

  for(int h=0;h<n_hasenbusch;h++){
-    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(0);
+    light_num.push_back(hasenbusch[h]);
  }
-  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
+  light_num.push_back(pv_mass);

  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
@ -408,9 +401,7 @@ int main(int argc, char **argv) {
    std::cout << GridLogMessage
 	      << " 2f quotient Action ";
    std::cout << "det D("<<light_den[h]<<")";
-    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
    std::cout << "/ det D("<<light_num[h]<<")";
-    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
    std::cout << std::endl;

    FermionAction::ImplParams ParamsNum(boundary);
@ -418,21 +409,11 @@ int main(int argc, char **argv) {
    FermionActionF::ImplParams ParamsDenF(boundary);
    FermionActionF::ImplParams ParamsNumF(boundary);
    
-    ParamsNum.dirichlet = NonDirichlet;
-    ParamsDen.dirichlet = NonDirichlet;
-
-    ParamsNum.partialDirichlet = 0;
-    ParamsDen.partialDirichlet = 0;
-    
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));

-    ParamsDenF.dirichlet = ParamsDen.dirichlet;
-    ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));

-    ParamsNumF.dirichlet = ParamsNum.dirichlet;
-    ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
    NumeratorsF.push_back  (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));

    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
@ -469,7 +450,6 @@ int main(int argc, char **argv) {
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
-  //  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@ -425,7 +425,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)

  err = r_eo-result;
  n2e= norm2(err);
-  std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< n2e<<std::endl;
  assert(n2e<1.0e-4);

  pickCheckerboard(Even,src_e,err);
--- a/documentation/David_notes.txt
+++ b/documentation/David_notes.txt
@ -0,0 +1,90 @@
+Branch: develop
+
+Files:
+
+Grid/lattice/PaddedCell.h -- Halo exchange
+tests/Test_general_stencil.cc -- test local off axis stencil addressing
+tests/debug/Test_padded_cell.cc -- test PaddedCell halo exchange and the General local stencil  by computing ALL plaquettes on lattice
+
+Functionality:
+
+-- extend a lattice field:
+Grid/lattice/PaddedCell.h
+
+// Constructor
+  PaddedCell(int _depth,GridCartesian *_grid)
+
+// Expand a field "in" to depth "d"
+  template<class vobj>
+  inline Lattice<vobj> Exchange(Lattice<vobj> &in)
+  
+// Take the "apple core" of in to a smaller local volume
+  template<class vobj>
+  inline Lattice<vobj> Extract(Lattice<vobj> &in)
+
+-- Plaquette test:
+tests/debug/Test_padded_cell.cc
+  /////////////////////////////////////////////////
+  // Create a padded cell of extra padding depth=1
+  /////////////////////////////////////////////////
+  int depth = 1;
+  PaddedCell Ghost(depth,&GRID);
+  LatticeGaugeField Ughost = Ghost.Exchange(Umu);
+
+///// Array for the site plaquette
+  GridBase *GhostGrid = Ughost.Grid();
+  LatticeComplex gplaq(GhostGrid); 
+
+  std::vector<Coordinate> shifts;
+  for(int mu=0;mu<Nd;mu++){
+    for(int nu=mu+1;nu<Nd;nu++){
+  
+      //    Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
+      Coordinate shift_0(Nd,0);
+      Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
+      Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
+      shifts.push_back(shift_0);
+      shifts.push_back(shift_mu);
+      shifts.push_back(shift_nu);
+      shifts.push_back(shift_0);
+    }
+  }
+  GeneralLocalStencil gStencil(GhostGrid,shifts);
+
+  gplaq=Zero();
+  {
+    autoView( gp_v , gplaq, CpuWrite);
+    autoView( t_v , trplaq, CpuRead);
+    autoView( U_v , Ughost, CpuRead);
+    for(int ss=0;ss<gp_v.size();ss++){
+      int s=0;
+      for(int mu=0;mu<Nd;mu++){
+	for(int nu=mu+1;nu<Nd;nu++){
+
+	  auto SE0 = gStencil.GetEntry(s+0,ss);
+	  auto SE1 = gStencil.GetEntry(s+1,ss);
+	  auto SE2 = gStencil.GetEntry(s+2,ss);
+	  auto SE3 = gStencil.GetEntry(s+3,ss);
+	
+	  int o0 = SE0->_offset;
+	  int o1 = SE1->_offset;
+	  int o2 = SE2->_offset;
+	  int o3 = SE3->_offset;
+	  
+	  auto U0 = U_v[o0](mu);
+	  auto U1 = U_v[o1](nu);
+	  auto U2 = adj(U_v[o2](mu));
+	  auto U3 = adj(U_v[o3](nu));
+
+	  gpermute(U0,SE0->_permute);
+	  gpermute(U1,SE1->_permute);
+	  gpermute(U2,SE2->_permute);
+	  gpermute(U3,SE3->_permute);
+	  
+	  gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 );
+	  s=s+4;
+	}
+      }
+    }
+  }
+  cplaq = Ghost.Extract(gplaq);
--- a/examples/socket_grid.cc
+++ b/examples/socket_grid.cc
@ -0,0 +1,133 @@
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <err.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+static int sock;
+static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
+static char sock_path[256];
+
+class UnixSockets { // Passes open file descriptors between ranks over AF_UNIX datagram sockets
+public:
+  // Create the file-scope datagram socket 'sock' and bind it to this rank's path; exits the process on failure.
+  static void Open(int rank)
+  {
+    sock = socket(AF_UNIX, SOCK_DGRAM, 0);
+    if (sock < 0) { perror("socket failure"); exit(EXIT_FAILURE); } // only negative means failure; 0 is a valid fd
+    printf("allocated socket %d\n",sock);
+
+    struct sockaddr_un sa_un = { 0 };
+    sa_un.sun_family = AF_UNIX;
+    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
+    unlink(sa_un.sun_path); // clear any existing file at this path before bind() creates it
+    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
+      perror("bind failure");
+      exit(EXIT_FAILURE);
+    }
+    printf("bound socket %d to %s\n",sock,sa_un.sun_path);
+  }
+
+  // Wait for one datagram on 'sock'; return the descriptor from its SCM_RIGHTS control message, or -1 on failure.
+  static int RecvFileDescriptor(void)
+  {
+    int n, fd;
+    char buf[1];
+    struct iovec iov;
+    struct msghdr msg;
+    struct cmsghdr *cmsg;
+    char cms[CMSG_SPACE(sizeof(int))];
+
+    iov.iov_base = buf;
+    iov.iov_len = 1;
+    memset(&msg, 0, sizeof msg); // msg_name/msg_namelen stay zero: the sender address is not requested
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    msg.msg_control = (caddr_t)cms;
+    msg.msg_controllen = sizeof cms;
+
+    if((n=recvmsg(sock, &msg, 0)) < 0) {
+      perror("recvmsg failed");
+      return -1;
+    }
+    if(n == 0){
+      fprintf(stderr,"recvmsg returned 0\n"); // errno is not set on this path, so perror would mislead
+      return -1;
+    }
+    cmsg = CMSG_FIRSTHDR(&msg); // NULL when the datagram carried no control message
+    if ( cmsg == NULL || cmsg->cmsg_type != SCM_RIGHTS ) {
+      fprintf(stderr,"recvmsg: no SCM_RIGHTS control message\n");
+      return -1;
+    }
+    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
+    printf("received fd %d from socket %d\n",fd,sock);
+    return fd;
+  }
+
+  // Send descriptor 'fildes' to rank 'xmit_to_rank' as an SCM_RIGHTS control message on 'sock'.
+  static void SendFileDescriptor(int fildes,int xmit_to_rank)
+  {
+    struct msghdr msg;
+    struct iovec iov;
+    struct cmsghdr *cmsg = NULL;
+    char ctrl[CMSG_SPACE(sizeof(int))];
+    char data = ' '; // one payload byte travels with the control message
+
+    memset(&msg, 0, sizeof(struct msghdr));
+    memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
+    iov.iov_base = &data;
+    iov.iov_len = sizeof(data);
+
+    snprintf(sock_path,sizeof(sock_path),sock_path_fmt,xmit_to_rank); // bounded write into the shared buffer
+    printf("sending FD %d over socket %d to rank %d AF_UNIX path %s\n",fildes,sock,xmit_to_rank,sock_path);fflush(stdout);
+
+    struct sockaddr_un sa_un = { 0 };
+    sa_un.sun_family = AF_UNIX;
+    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
+
+    msg.msg_name = (void *)&sa_un;
+    msg.msg_namelen = sizeof(sa_un);
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    msg.msg_controllen =  CMSG_SPACE(sizeof(int));
+    msg.msg_control = ctrl;
+
+    cmsg = CMSG_FIRSTHDR(&msg);
+    cmsg->cmsg_level = SOL_SOCKET;
+    cmsg->cmsg_type = SCM_RIGHTS;
+    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+    memcpy(CMSG_DATA(cmsg), &fildes, sizeof(int)); // copy rather than store through a cast pointer
+
+    if ( sendmsg(sock, &msg, 0) == -1 ) perror("sendmsg failed");
+  }
+};
+
+int main(int argc, char **argv)
+{
+  pid_t pid = fork();
+  if ( pid < 0 ) { perror("fork failed"); exit(EXIT_FAILURE); } // -1 must not be mistaken for the parent
+  int me = pid?0:1; // parent acts as rank 0, child as rank 1
+  UnixSockets::Open(me);
+  sleep(10); // need MPI barrier; presumably this wait lets both ranks bind before the send
+  const char * message = "Hello, World\n";
+  if( me ) {
+    int fd = open("foo",O_RDWR|O_CREAT,0666);
+    if ( fd < 0 ) {
+      perror("failed to open file");
+      exit(EXIT_FAILURE);
+    }
+    // rank 1 sends to rank 0
+    UnixSockets::SendFileDescriptor(fd,0);
+    close(fd);
+  } else {
+    // rank 0 receives from rank 1
+    int fd = UnixSockets::RecvFileDescriptor();
+    if ( fd < 0 ) exit(EXIT_FAILURE); // RecvFileDescriptor has already reported the error
+    if ( write(fd,(const void *)message,strlen(message)) < 0 ) perror("write failed");
+    close(fd);
+  }
+}
--- a/grid-config.in
+++ b/grid-config.in
@ -60,7 +60,7 @@ while test $# -gt 0; do
    ;;
    
    --cxxflags)
-      echo @GRID_CXXFLAGS@
+      echo @GRID_CXXFLAGS@ -I@prefix@/include
    ;;
    
    --cxx)
@ -72,11 +72,11 @@ while test $# -gt 0; do
    ;;
    
    --ldflags)
-      echo @GRID_LDFLAGS@
+      echo @GRID_LDFLAGS@ -L@prefix@/lib
    ;;
    
    --libs)
-      echo @GRID_LIBS@
+      echo @GRID_LIBS@ -lGrid
    ;;
    
    --summary)
--- a/systems/Lumi/benchmarks/bench2.slurm
+++ b/systems/Lumi/benchmarks/bench2.slurm
@ -0,0 +1,44 @@
+#!/bin/bash -l
+#SBATCH --job-name=bench_lehner
+#SBATCH --partition=small-g
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=7
+#SBATCH --gpus-per-node=8
+#SBATCH --time=00:10:00
+#SBATCH --account=project_465000546
+#SBATCH --gpu-bind=none
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+# Runs Benchmark_dwf_fp32 on 2 nodes x 8 ranks for several lattice volumes,
+# once with overlapped and once with sequential communications.
+
+# One starting core per local rank, passed to srun --cpu-bind below.
+CPU_BIND="map_cpu:48,56,32,40,16,24,1,8"
+echo "$CPU_BIND"
+
+# Generate a per-rank wrapper that selects one GPU from SLURM_LOCALID and then
+# execs the real command.  The escaped \$ are expanded when the wrapper runs,
+# not when it is written.  "\$@" keeps each argument intact as it is forwarded.
+cat << EOF > select_gpu
+#!/bin/bash
+export GPU_MAP=(0 1 2 3 4 5 6 7)
+export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
+export HIP_VISIBLE_DEVICES=\$GPU
+unset ROCR_VISIBLE_DEVICES
+echo RANK \$SLURM_LOCALID using GPU \$GPU
+exec "\$@"
+EOF
+
+chmod +x ./select_gpu
+
+root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
+source ${root}/sourceme.sh
+
+export OMP_NUM_THREADS=7
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+
+# Each run writes its own log file, named after the comms mode and the volume.
+for vol in 16.16.16.64 32.32.32.64  32.32.32.128
+do
+srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
+#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol
+
+srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
+#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
+done
+
--- a/systems/Lumi/config-command
+++ b/systems/Lumi/config-command
@ -0,0 +1,30 @@
+# Configure command for a HIP (gfx90a) build of Grid using spack-provided
+# c-lime, gmp and mpfr.  It invokes ../../configure, so it must be run from a
+# directory two levels below the source tree.
+spack load c-lime
+spack load gmp
+spack load mpfr
+# Extract the install prefix of each package from `spack find --paths`.
+# NOTE(review): the fixed `cut -c N-` offsets assume a particular column
+# layout of the spack output -- confirm against the installed spack version.
+CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
+GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
+MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
+# Print the detected prefixes; the leading X makes an empty value visible.
+echo clime X$CLIME
+echo gmp X$GMP
+echo mpfr X$MPFR
+
+# FFTW_DIR is not set in this script and must come from the environment
+# (presumably the cray-fftw module -- TODO confirm).
+../../configure \
+--enable-comms=mpi-auto \
+--with-lime=$CLIME \
+--enable-unified=no \
+--enable-shm=nvlink \
+--enable-accelerator=hip \
+--enable-gen-simd-width=64 \
+--enable-simd=GPU \
+--enable-accelerator-cshift \
+--with-gmp=$GMP \
+--with-mpfr=$MPFR \
+--with-fftw=$FFTW_DIR/.. \
+--disable-fermion-reps \
+--disable-gparity \
+CXX=hipcc MPICXX=mpicxx \
+  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
+  LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp" 
+
+
+
--- a/systems/Lumi/sourceme.sh
+++ b/systems/Lumi/sourceme.sh
@ -0,0 +1,5 @@
+# Environment setup, meant to be sourced: enables spack, loads the system
+# modules, then loads the spack packages c-lime, gmp and mpfr.
+source ~/spack/share/spack/setup-env.sh
+module load CrayEnv LUMI/22.12 partition/G  cray-fftw/3.3.10.1 rocm
+spack load c-lime
+spack load gmp
+spack load mpfr
--- a/systems/PVC/setup.sh
+++ b/systems/PVC/setup.sh
@ -3,8 +3,14 @@ export https_proxy=http://proxy-chain.intel.com:911
 export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH

 module load intel-release
-source /opt/intel/oneapi/PVC_setup.sh
+module load intel-comp-rt/embargo-ci-neo
+
+#source /opt/intel/oneapi/PVC_setup.sh
 #source /opt/intel/oneapi/ATS_setup.sh
+#module load intel-nightly/20230331
+#module load intel-comp-rt/ci-neo-master/026093
+
+#module load intel/mpich
 module load intel/mpich/pvc45.3
 export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH

--- a/systems/Sunspot/benchmarks/bench.pbs
+++ b/systems/Sunspot/benchmarks/bench.pbs
@ -0,0 +1,46 @@
+#!/bin/bash
+
+#PBS -l select=1:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=01:00:00
+#PBS -N dwf
+#PBS -k doe
+
+# PBS job script: sets up the environment, reports the allocated nodes and
+# launches Benchmark_dwf_fp32 through the gpu_tile_compact.sh wrapper.
+
+HDIR=/home/paboyle/
+module use /soft/testing/modulefiles/
+module load intel-UMD23.05.25593.11/23.05.25593.11
+module load tools/pti-gpu
+export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
+export PATH=$HDIR/tools/bin:$PATH
+
+export TZ='/usr/share/zoneinfo/US/Central'
+export OMP_PROC_BIND=spread
+export OMP_NUM_THREADS=3
+unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+# NOTE(review): this submits another job from inside the running job --
+# confirm the nested submission is intended.
+qsub jobscript.pbs
+
+echo Jobid: $PBS_JOBID
+echo Running on host `hostname`
+echo Running on nodes `cat $PBS_NODEFILE`
+
+echo NODES
+cat $PBS_NODEFILE
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12         # Number of MPI ranks per node
+NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+
+echo "NUM_NODES=${NNODES}  TOTAL_RANKS=${NTOTRANKS}  RANKS_PER_NODE=${NRANKS}  THREADS_PER_RANK=${OMP_NUM_THREADS}"
+echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
+
+
+CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+	     ./gpu_tile_compact.sh \
+	./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
+	--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+# Log the exact command line, then run it.  CMD is deliberately unquoted so
+# that the shell splits it back into the program name and its arguments.
+echo $CMD
+$CMD
+
--- a/systems/Sunspot/benchmarks/gpu_tile_compact.sh
+++ b/systems/Sunspot/benchmarks/gpu_tile_compact.sh
@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Per-rank launcher: derives a GPU id and tile id from PALS_LOCAL_RANKID,
+# exports them as ZE_AFFINITY_MASK and then runs the command given as
+# arguments.  Local rank 0 is run under onetrace.
+
+# Print usage and exit with status 1.
+display_help() {
+  echo " Will map gpu tile to rank in compact and then round-robin fashion"
+  echo " Usage (only work for one node of ATS/PVC):"
+  echo "   mpiexec --np N gpu_tile_compact.sh ./a.out"
+  echo
+  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
+  echo "   0 Rank 0.0"
+  echo "   1 Rank 0.1"
+  echo "   2 Rank 1.0"
+  echo "   3 Rank 1.1"
+  echo "   4 Rank 2.0"
+  echo "   5 Rank 2.1"
+  echo "   6 Rank 0.0"
+  echo
+  echo " Hacked together by apl@anl.gov, please contact if bug found"
+  exit 1
+}
+
+#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
+#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
+# The counts are hard-coded here; the udev probe above is kept for reference.
+num_gpu=6
+num_tile=2
+
+if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
+  display_help
+fi
+
+# Consecutive local ranks fill the tiles of one GPU before moving to the
+# next GPU, wrapping around after num_gpu GPUs.
+gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
+tile_id=$((PALS_LOCAL_RANKID % num_tile))
+
+unset EnableWalkerPartition
+export EnableImplicitScaling=0
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
+
+# Quoted so that an unset PALS_LOCAL_RANKID compares as an empty string
+# instead of making the test a syntax error.
+if [ "$PALS_LOCAL_RANKID" = 0 ]
+then
+    onetrace --chrome-device-timeline "$@"
+#    "$@"
+else
+"$@"
+fi
--- a/systems/Sunspot/config-command
+++ b/systems/Sunspot/config-command
@ -0,0 +1,16 @@
+# Configure command for a SYCL build of Grid with icpx.  It invokes
+# ../../configure, so it must be run from a directory two levels below the
+# source tree.  TOOLS supplies extra include and lib64 directories.
+# NOTE(review): $INSTALL is used in CXXFLAGS but is not set in this script --
+# it must come from the caller's environment; confirm.
+TOOLS=$HOME/tools
+../../configure \
+	--enable-simd=GPU \
+	--enable-gen-simd-width=64 \
+	--enable-comms=mpi-auto \
+	--enable-accelerator-cshift \
+	--disable-gparity \
+	--disable-fermion-reps \
+	--enable-shm=nvlink \
+	--enable-accelerator=sycl \
+	--enable-unified=no \
+	MPICXX=mpicxx \
+	CXX=icpx \
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@ -1,2 +1,4 @@
-CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=yes
+BREW=/opt/local/
+MPICXX=mpicxx CXX=c++-12 ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
+

--- a/tests/Test_general_stencil.cc
+++ b/tests/Test_general_stencil.cc
@ -115,6 +115,7 @@ int main(int argc, char ** argv)
 	  if (SE->_permute & 0x2 ) { permute(check[i],tmp,1); tmp=check[i];}
 	  if (SE->_permute & 0x4 ) { permute(check[i],tmp,2); tmp=check[i];}
 	  if (SE->_permute & 0x8 ) { permute(check[i],tmp,3); tmp=check[i];}
+	  //	  std::cout<<GridLogMessage<<"stencil["<<i<<"] "<< check[i]<< " perm "<<(uint32_t)SE->_permute <<std::endl;
 	}

 	Real nrmC = norm2(Check);
@ -138,18 +139,17 @@ int main(int argc, char ** argv)
 	  ddiff = check -bar;
 	  diff =norm2(ddiff);
 	  if ( diff > 0){
-	    std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
-		      <<") " <<check<<" vs "<<bar<<std::endl;
+	    std::cout <<"Diff at Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
+		      <<") stencil " <<check<<" vs cshift "<<bar<<std::endl;
 	  }

-
 	}}}}

 	if (nrm > 1.0e-4) {
 	  autoView( check , Check, CpuRead);
 	  autoView(   bar ,   Bar, CpuRead);
 	  for(int i=0;i<check.size();i++){
-	    std::cout << i<<" Check "<<check[i]<< "\n"<<i<<" Bar "<<bar[i]<<std::endl;
+	    std::cout << i<<" ERROR Check \n"<<check[i]<< "\n"<<i<<" Bar \n"<<bar[i]<<std::endl;
 	  }
 	}
 	if (nrm > 1.0e-4) exit(-1);
--- a/tests/core/Test_fft_pf.cc
+++ b/tests/core/Test_fft_pf.cc
@ -0,0 +1,307 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_fft_pf.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+// Compares the 4d propagator returned by FreePropagator() ("FFT and Feynman
+// rules") with one obtained from a conjugate gradient solve of the 5d
+// operator on a unit gauge field.  Three sections are run: partial fraction
+// Zolotarev overlap, Cayley tanh overlap, and partial fraction again.
+// Results are reported as norms and slice norms only.
+// NOTE(review): nothing asserts on the size of the difference, so this
+// program cannot fail on a numerical mismatch -- confirm that is intended.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1});
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  // NOTE(review): vol is computed here but not used later in this function.
+  int vol = 1;
+  for(int d=0;d<latt_size.size();d++){
+    vol = vol * latt_size[d];
+  }
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian RBGRID(&GRID);
+
+  // NOTE(review): ci is not used later in this function.
+  ComplexD ci(0.0,1.0);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridSerialRNG          sRNG;  sRNG.SeedFixedIntegers(seeds); // naughty seeding
+  GridParallelRNG          pRNG(&GRID);
+  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeFieldD Umu(&GRID);
+
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
+
+  ////////////////////////////////////////////////////
+  // PF prop
+  ////////////////////////////////////////////////////
+  LatticeFermionD    src(&GRID);
+
+  // The gaussian fill is overwritten below while the #if 1 branch is active:
+  // src is zeroed and a single random spin-colour vector is poked in at the
+  // site with all four coordinates equal to 0.
+  gaussian(pRNG,src);
+#if 1
+    Coordinate point(4,0);
+    src=Zero();
+    SpinColourVectorD ferm; gaussian(sRNG,ferm);
+    pokeSite(ferm,src,point);
+#endif
+  
+  {
+    std::cout<<"****************************************"<<std::endl;
+    std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator \n";
+    std::cout<<"****************************************"<<std::endl;
+
+    //    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    LatticeFermionD    diff(&GRID);
+
+    const int Ls=48+1;
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
+
+    RealD mass=0.1;
+    RealD M5  =0.8;
+    OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
+
+    // Momentum space prop
+    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
+    // NOTE(review): fiveD is not used later in this scope.
+    bool fiveD = false; //calculate 4d free propagator
+
+    std::cout << " Free propagator " <<std::endl;
+    Dov.FreePropagator(src,ref,mass) ;
+    std::cout << " Free propagator norm "<< norm2(ref) <<std::endl;
+
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    LatticeFermionD    src5(FGrid); src5=Zero();
+    LatticeFermionD    tmp5(FGrid); 
+    LatticeFermionD    result5(FGrid); result5=Zero();
+    LatticeFermionD    result4(&GRID); 
+    const int sdir=0;
+
+    ////////////////////////////////////////////////////////////////////////
+    // Import
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Free propagator Import "<< norm2(src) <<std::endl;
+    Dov.ImportPhysicalFermionSource  (src,src5);
+    std::cout << " Free propagator Imported "<< norm2(src5) <<std::endl;
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Conjugate gradient on normal equations system
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
+    // Apply Mdag to the source first, then solve with the MdagM operator.
+    Dov.Mdag(src5,tmp5);
+    src5=tmp5;
+    MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
+    CG(HermOp,src5,result5);
+    ////////////////////////////////////////////////////////////////////////
+    // Domain wall physical field propagator
+    ////////////////////////////////////////////////////////////////////////
+    Dov.ExportPhysicalFermionSolution(result5,result4);
+
+    // From DWF4d.pdf :
+    //
+    // Dov_pf = 2/(1-m) D_cayley_ovlap  [ Page 43 ]
+    // Dinv_cayley_ovlap = 2/(1-m) Dinv_pf 
+    // Dinv_cayley_surface =1/(1-m) ( Dinv_cayley_ovlap - 1 ) =>  2/(1-m)^2 Dinv_pf - 1/(1-m) * src   [ Eq.2.67 ]
+
+    RealD scale = 2.0/(1.0-mass)/(1.0-mass);
+    result4 = result4 * scale;
+    result4 = result4 - src*(1.0/(1.0-mass)); // Subtract contact term
+    DumpSliceNorm("Src",src);
+    DumpSliceNorm("Grid",result4);
+    DumpSliceNorm("Fourier",ref);
+
+    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
+    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
+
+    diff = result4- ref;
+    DumpSliceNorm("diff ",diff);
+    
+  }
+  
+  ////////////////////////////////////////////////////
+  // Dwf prop
+  ////////////////////////////////////////////////////
+  {
+    std::cout<<"****************************************"<<std::endl;
+    std::cout << "Testing Dov(Hw) Mom space 4d propagator \n";
+    std::cout<<"****************************************"<<std::endl;
+
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    LatticeFermionD    diff(&GRID);
+
+    const int Ls=48;
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
+
+    RealD mass=0.1;
+    RealD M5  =0.8;
+
+    OverlapWilsonCayleyTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
+
+    // Momentum space prop
+    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
+    Dov.FreePropagator(src,ref,mass) ;
+
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    LatticeFermionD    src5(FGrid); src5=Zero();
+    LatticeFermionD    tmp5(FGrid); 
+    LatticeFermionD    result5(FGrid); result5=Zero();
+    LatticeFermionD    result4(&GRID); 
+    const int sdir=0;
+
+    ////////////////////////////////////////////////////////////////////////
+    // Domain wall physical field source; need D_minus
+    ////////////////////////////////////////////////////////////////////////
+    /*
+	chi_5[0]   = chiralProjectPlus(chi);
+	chi_5[Ls-1]= chiralProjectMinus(chi);
+    */      
+    tmp =   (src + G5*src)*0.5;      InsertSlice(tmp,src5,   0,sdir);
+    tmp =   (src - G5*src)*0.5;      InsertSlice(tmp,src5,Ls-1,sdir);
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Conjugate gradient on normal equations system
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
+    // Source preparation: Dminus first, then Mdag, before the MdagM solve.
+    Dov.Dminus(src5,tmp5);
+    src5=tmp5;
+    Dov.Mdag(src5,tmp5);
+    src5=tmp5;
+    MdagMLinearOperator<OverlapWilsonCayleyTanhFermionD,LatticeFermionD> HermOp(Dov);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
+    CG(HermOp,src5,result5);
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Domain wall physical field propagator
+    ////////////////////////////////////////////////////////////////////////
+    /*
+      psi  = chiralProjectMinus(psi_5[0]);
+      psi += chiralProjectPlus(psi_5[Ls-1]);
+    */
+    ExtractSlice(tmp,result5,0   ,sdir);   result4 =         (tmp-G5*tmp)*0.5;
+    ExtractSlice(tmp,result5,Ls-1,sdir);   result4 = result4+(tmp+G5*tmp)*0.5;
+    
+    std::cout << " Taking difference" <<std::endl;
+    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
+    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
+    DumpSliceNorm("Grid",result4);
+    DumpSliceNorm("Fourier",ref);
+    diff = ref - result4;
+    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
+    
+    DumpSliceNorm("diff",diff);
+
+  }
+
+  
+  // NOTE(review): apart from the banner text ("with q") this section repeats
+  // the first partial fraction section statement for statement -- confirm
+  // whether a different setup was intended here.
+  {
+    std::cout<<"****************************************"<<std::endl;
+    std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator with q\n";
+    std::cout<<"****************************************"<<std::endl;
+
+    //    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    LatticeFermionD    diff(&GRID);
+
+    const int Ls=48+1;
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
+
+    RealD mass=0.1;
+    RealD M5  =0.8;
+    OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
+
+    // Momentum space prop
+    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
+    bool fiveD = false; //calculate 4d free propagator
+
+    std::cout << " Free propagator " <<std::endl;
+    Dov.FreePropagator(src,ref,mass) ;
+    std::cout << " Free propagator norm "<< norm2(ref) <<std::endl;
+
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    LatticeFermionD    src5(FGrid); src5=Zero();
+    LatticeFermionD    tmp5(FGrid); 
+    LatticeFermionD    result5(FGrid); result5=Zero();
+    LatticeFermionD    result4(&GRID); 
+    const int sdir=0;
+
+    ////////////////////////////////////////////////////////////////////////
+    // Import
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Free propagator Import "<< norm2(src) <<std::endl;
+    Dov.ImportPhysicalFermionSource  (src,src5);
+    std::cout << " Free propagator Imported "<< norm2(src5) <<std::endl;
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Conjugate gradient on normal equations system
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
+    Dov.Mdag(src5,tmp5);
+    src5=tmp5;
+    MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
+    CG(HermOp,src5,result5);
+    ////////////////////////////////////////////////////////////////////////
+    // Domain wall physical field propagator
+    ////////////////////////////////////////////////////////////////////////
+    Dov.ExportPhysicalFermionSolution(result5,result4);
+
+    // From DWF4d.pdf :
+    //
+    // Dov_pf = 2/(1-m) D_cayley_ovlap  [ Page 43 ]
+    // Dinv_cayley_ovlap = 2/(1-m) Dinv_pf 
+    // Dinv_cayley_surface =1/(1-m) ( Dinv_cayley_ovlap - 1 ) =>  2/(1-m)^2 Dinv_pf - 1/(1-m) * src   [ Eq.2.67 ]
+
+    RealD scale = 2.0/(1.0-mass)/(1.0-mass);
+    result4 = result4 * scale;
+    result4 = result4 - src*(1.0/(1.0-mass)); // Subtract contact term
+    DumpSliceNorm("Src",src);
+    DumpSliceNorm("Grid",result4);
+    DumpSliceNorm("Fourier",ref);
+
+    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
+    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
+
+    diff = result4- ref;
+    DumpSliceNorm("diff ",diff);
+    
+  }
+
+  
+  Grid_finalize();
+}
--- a/tests/core/Test_lie_generators.cc
+++ b/tests/core/Test_lie_generators.cc
@ -63,7 +63,9 @@ int main(int argc, char** argv) {
  std::cout << "Dimension of adjoint representation: "<< SU2Adjoint::Dimension << std::endl;

  // guard as this code fails to compile for Nc != 3
-#if (Nc == 3)
+#if 1
+
+  std::cout << " Printing  Adjoint Generators"<< std::endl;
    
  SU2Adjoint::printGenerators();
  SU2::testGenerators();
@ -148,10 +150,33 @@ int main(int argc, char** argv) {
    typename AdjointRep<Nc>::LatticeMatrix Vrmu = peekLorentz(Vr,mu);
    pokeLorentz(UrVr,Urmu*Vrmu, mu);
  }
-    
+
+  typedef typename SU_Adjoint<Nc>::AMatrix AdjointMatrix;
  typename AdjointRep<Nc>::LatticeField Diff_check = UVr - UrVr;
  std::cout << GridLogMessage << "Group structure SU("<<Nc<<") check difference (Adjoint representation) : " << norm2(Diff_check) << std::endl;
-    
+
+  std::cout << GridLogMessage << "****************************************** " << std::endl;
+  std::cout << GridLogMessage << " MAP BETWEEN FUNDAMENTAL AND ADJOINT CHECK " << std::endl;
+  std::cout << GridLogMessage << "****************************************** " << std::endl;
+  for(int a=0;a<Nc*Nc-1;a++){
+  for(int b=0;b<Nc*Nc-1;b++){
+  for(int c=0;c<Nc*Nc-1;c++){
+    ColourMatrix Ta;
+    ColourMatrix Tb;
+    ColourMatrix Tc;
+    SU3::generator(a, Ta);
+    SU3::generator(b, Tb);
+    SU3::generator(c, Tc);
+    AdjointMatrix TRa;
+    SU3Adjoint::generator(a,TRa);
+    Complex tr1 = trace ( Tc * ( Ta*Tb-Tb*Ta)); // i/2 fabc
+    Complex tr2 = TRa()()(b,c) * Complex(0,1);
+    std::cout << " 2 Tr( Tc[Ta,Tb]) " << 2.0*tr1<<std::endl;
+    std::cout << " - TRa_bc " << tr2<<std::endl;
+    assert(abs( (2.0*tr1-tr2) ) < 1.0e-7);
+    std::cout << "------------------"<<std::endl;
+  }}}
+  
  // Check correspondence of algebra and group transformations
  // Create a random vector
  SU3::LatticeAlgebraVector h_adj(grid);
--- a/tests/debug/Test_iwasaki_action_newstaple.cc
+++ b/tests/debug/Test_iwasaki_action_newstaple.cc
@ -0,0 +1,188 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_iwasaki_action_newstaple.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+////////////////////////////////////////////////////////////////////////
+// PlaqPlusRectangleAction (original implementation, used as the reference)
+////////////////////////////////////////////////////////////////////////
+// Gauge action built from the average plaquette and the average rectangle,
+// weighted by c_plaq and c_rect respectively.
+template<class Gimpl>
+class PlaqPlusRectangleActionOrig : public Action<typename Gimpl::GaugeField> {
+public:
+
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+private:
+  RealD c_plaq; // coefficient of the plaquette term
+  RealD c_rect; // coefficient of the rectangle term
+
+public:
+  // b is stored as the plaquette coefficient, c as the rectangle coefficient.
+  PlaqPlusRectangleActionOrig(RealD b,RealD c)
+    : c_plaq(b),
+      c_rect(c)
+  {}
+
+  virtual std::string action_name(){
+    return "PlaqPlusRectangleActionOrig";
+  }
+
+  // Nothing to refresh: this action has no pseudofermion fields.
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}
+
+  // Returns one log line per coefficient, each tagged with the action name.
+  virtual std::string LogParameters(){
+    std::stringstream report;
+    report << GridLogMessage << "["<<action_name() <<"] c_plaq: " << c_plaq << std::endl;
+    report << GridLogMessage << "["<<action_name() <<"] c_rect: " << c_rect << std::endl;
+    return report.str();
+  }
+
+  // Action value computed from the average plaquette and average rectangle.
+  virtual RealD S(const GaugeField &U) {
+    const RealD vol  = U.Grid()->gSites();
+    const RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+    const RealD rect = WilsonLoops<Gimpl>::avgRectangle(U);
+
+    return c_plaq*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5
+      +c_rect*(1.0 -rect)*(Nd*(Nd-1.0))*vol;
+  }
+
+  // Writes into dSdU, direction by direction, the sum of
+  // Ta(U_mu * staple) weighted by the plaquette factor and
+  // Ta(U_mu * rectangle staple) weighted by the rectangle factor.
+  virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
+    //extend Ta to include Lorentz indexes
+    const RealD plaq_weight = c_plaq/RealD(Nc)*0.5;
+    const RealD rect_weight = c_rect/RealD(Nc)*0.5;
+
+    GridBase *grid = Umu.Grid();
+
+    // Per-direction links and the matching RectStapleDouble fields.
+    std::vector<GaugeLinkField> links (Nd,grid);
+    std::vector<GaugeLinkField> links2(Nd,grid);
+
+    for(int dir=0;dir<Nd;dir++){
+      links[dir] = PeekIndex<LorentzIndex>(Umu,dir);
+      WilsonLoops<Gimpl>::RectStapleDouble(links2[dir],links[dir],dir);
+    }
+
+    GaugeLinkField force_mu(grid);
+    GaugeLinkField stap(grid);
+
+    for (int dir=0; dir < Nd; dir++){
+      // Plaquette staple in this direction
+      WilsonLoops<Gimpl>::Staple(stap,Umu,dir);
+      force_mu = Ta(links[dir]*stap)*plaq_weight;
+
+      // Rectangle staple in this direction; stap is reused as the output
+      WilsonLoops<Gimpl>::RectStaple(Umu,stap,links2,links,dir);
+      force_mu = force_mu + Ta(links[dir]*stap)*rect_weight;
+
+      PokeIndex<LorentzIndex>(dSdU, force_mu, dir);
+    }
+  }
+
+};
+
+// Convenience for common physically defined cases.
+//
+// RBC c1 parameterisation is not really RBC but don't have good
+// reference and we are happy to change name if prior use of this plaq coeff
+// parameterisation is made known to us. 
+// Plaquette-plus-rectangle action parameterised by (beta, c1): the plaquette
+// coefficient is beta*(1-8*c1) and the rectangle coefficient is beta*c1.
+template<class Gimpl>
+class RBCGaugeActionOrig : public PlaqPlusRectangleActionOrig<Gimpl> {
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  RBCGaugeActionOrig(RealD beta,RealD c1)
+    : PlaqPlusRectangleActionOrig<Gimpl>(beta*(1.0-8.0*c1), beta*c1)
+  {}
+
+  virtual std::string action_name(){
+    return "RBCGaugeActionOrig";
+  }
+};
+
+// RBCGaugeActionOrig with the rectangle parameter fixed at c1 = -0.331.
+template<class Gimpl>
+class IwasakiGaugeActionOrig : public RBCGaugeActionOrig<Gimpl> {
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  IwasakiGaugeActionOrig(RealD beta)
+    : RBCGaugeActionOrig<Gimpl>(beta,-0.331)
+  {}
+
+  virtual std::string action_name(){
+    return "IwasakiGaugeActionOrig";
+  }
+};
+
+
+// Checks that IwasakiGaugeAction::deriv agrees with the reference
+// IwasakiGaugeActionOrig::deriv on a hot-start gauge field, and reports the
+// time taken by each over ntest repetitions.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate lattice = GridDefaultLatt();
+  Coordinate simd    = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi     = GridDefaultMpi();
+  std::cout << " mpi "<<mpi<<std::endl;
+  std::cout << " simd "<<simd<<std::endl;
+  std::cout << " latt "<<lattice<<std::endl;
+  GridCartesian GRID(lattice,simd,mpi);
+
+  // Fixed seeds, so every run starts from the same gauge field
+  GridParallelRNG   pRNG(&GRID);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  LatticeGaugeField U(&GRID);
+
+  SU<Nc>::HotConfiguration(pRNG,U);
+
+  // Gauge implementation: periodic if PRD is defined, otherwise
+  // charge-conjugation boundaries in directions 0 and 3.
+  //#define PRD
+#ifdef PRD
+  typedef PeriodicGimplD Gimpl;
+#else
+  typedef ConjugateGimplD Gimpl;
+  std::vector<int> conj_dirs(Nd,0);
+  conj_dirs[0]=1;
+  conj_dirs[3]=1;
+  Gimpl::setDirections(conj_dirs);
+#endif
+
+  typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
+  typedef typename WilsonLoops<Gimpl>::GaugeLorentz GaugeLorentz;
+
+  GaugeLorentz derivOrig(&GRID), derivNew(&GRID);
+  double beta = 2.13;
+  IwasakiGaugeActionOrig<Gimpl> action_orig(beta);
+  IwasakiGaugeAction<Gimpl> action_new(beta);
+
+  // Accumulated times in milliseconds
+  double torig=0, tnew=0;
+  int ntest = 10;
+  for(int iter=0;iter<ntest;++iter){
+    double start = usecond();
+    action_orig.deriv(U, derivOrig);
+    double mid = usecond();
+    action_new.deriv(U, derivNew);
+    double stop = usecond();
+
+    // The two force fields must agree
+    GaugeLorentz delta = derivOrig - derivNew;
+    double n = norm2(delta);
+    std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
+    assert(n<1e-10);
+
+    // usecond() differences are in microseconds; report milliseconds
+    double ms_orig = (mid-start)/1000;
+    double ms_new  = (stop-mid)/1000;
+    std::cout << GridLogMessage << "Timings orig: " << ms_orig << "ms,  new: " << ms_new << "ms" << std::endl;
+    torig += ms_orig;
+    tnew  += ms_new;
+  }
+  std::cout << GridLogMessage << "Avg timings " << ntest << " iterations: orig:" << torig/ntest << "ms,   new:" << tnew/ntest << "ms" << std::endl;
+  
+  Grid_finalize();
+}
--- a/tests/debug/Test_optimized_staple_gaugebc.cc
+++ b/tests/debug/Test_optimized_staple_gaugebc.cc
@ -0,0 +1,94 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_optimized_staple_gaugebc.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+using namespace std;
+using namespace Grid;
+ 
+// Checks, direction by direction, that WilsonLoops::RectStapleOptimised
+// reproduces WilsonLoops::RectStapleUnoptimised on a hot-start gauge field,
+// and reports the average time per direction of each.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size  = GridDefaultLatt();
+  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
+  std::cout << " mpi "<<mpi_layout<<std::endl;
+  std::cout << " simd "<<simd_layout<<std::endl;
+  std::cout << " latt "<<latt_size<<std::endl;
+  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
+
+  // Fixed seeds, so every run starts from the same gauge field
+  GridParallelRNG   pRNG(&GRID);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  LatticeGaugeField U(&GRID);
+
+  SU<Nc>::HotConfiguration(pRNG,U);
+
+  // Gauge implementation: periodic if PRD is defined, otherwise
+  // charge-conjugation boundaries in directions 0 and 3.
+  //#define PRD
+#ifdef PRD
+  typedef PeriodicGimplD Gimpl;
+#else
+  typedef ConjugateGimplD Gimpl;
+  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1;
+  Gimpl::setDirections(conj_dirs);
+#endif
+
+  typedef typename WilsonLoops<Gimpl>::GaugeMat GaugeMat;
+
+  int count = 0;
+  double torig=0, topt=0; // accumulated microseconds
+     
+  // Inputs of the optimised routine: per-direction links and the matching
+  // RectStapleDouble fields.
+  std::vector<GaugeMat> Umu(Nd,&GRID), U2(Nd,&GRID);
+  for(int mu=0;mu<Nd;mu++){
+    Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
+    WilsonLoops<Gimpl>::RectStapleDouble(U2[mu], Umu[mu], mu);
+  }
+
+  std::cout << GridLogMessage << "Checking optimized vs unoptimized RectStaple" << std::endl;
+  for(int mu=0;mu<Nd;mu++){
+    GaugeMat staple_orig(&GRID), staple_opt(&GRID);
+    double t0 = usecond();
+    WilsonLoops<Gimpl>::RectStapleUnoptimised(staple_orig,U,mu);
+    double t1 = usecond();
+    WilsonLoops<Gimpl>::RectStapleOptimised(staple_opt, U2, Umu, mu);
+    double t2 = usecond();
+    torig += t1-t0;  topt += t2-t1;
+    ++count;
+    
+    // The two staples must agree
+    GaugeMat diff = staple_orig - staple_opt;
+    double n = norm2(diff);
+    std::cout << GridLogMessage << mu << " " << n << std::endl;
+    assert(n<1e-10);
+  }
+  std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  optimized: " << topt/1000/count << "ms" << std::endl;
+  
+  Grid_finalize();
+}
--- a/tests/debug/Test_padded_cell.cc
+++ b/tests/debug/Test_padded_cell.cc
@ -0,0 +1,184 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_padded_cell.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class vobj> void gpermute(vobj & inout,int perm){
+  // Apply permute levels 0..3 in ascending order, one for each set bit among the low four bits of perm.
+  vobj scratch=inout;
+  for(int level=0;level<4;level++){
+    if (perm & (0x1<<level) ) { permute(inout,scratch,level); scratch=inout;}
+  }
+}
+  
+int main (int argc, char ** argv) // Test driver: average plaquette from WilsonLoops vs. one rebuilt with PaddedCell + GeneralLocalStencil
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size  = GridDefaultLatt();
+  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
+  std::cout << " mpi "<<mpi_layout<<std::endl;
+  std::cout << " simd "<<simd_layout<<std::endl;
+  std::cout << " latt "<<latt_size<<std::endl;
+  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
+
+  GridParallelRNG   pRNG(&GRID);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); // fixed seeds for the random gauge field
+  LatticeGaugeField Umu(&GRID);
+
+  SU<Nc>::HotConfiguration(pRNG,Umu);
+
+  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu); // reference value the padded-cell result is compared with
+  LatticeComplex trplaq(&GRID); // NOTE(review): never assigned; only opened below as the read view t_v, which is unused
+
+  std::vector<LatticeColourMatrix> U(Nd, Umu.Grid()); // NOTE(review): filled here but not referenced again in this function
+  for (int mu = 0; mu < Nd; mu++) {
+    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+  }
+
+  std::cout << GridLogMessage << " Average plaquette "<<plaq<<std::endl;
+
+  LatticeComplex cplaq(&GRID); cplaq=Zero(); // receives the extracted per-site plaquette sum further down
+
+  /////////////////////////////////////////////////
+  // Create a padded cell of extra padding depth=1
+  /////////////////////////////////////////////////
+  int depth = 1;
+  PaddedCell Ghost(depth,&GRID);
+  LatticeGaugeField Ughost = Ghost.Exchange(Umu); // gauge field on the padded grid
+
+  ///////////////////////////////////////////////////////////////////
+  // Temporary debug Hack for single rank sim:
+  // Check the contents of the cell are periodically replicated
+  // In future ONLY pad those dimensions that are not local to node
+  ///////////////////////////////////////////////////////////////////
+#if 0
+  {
+    double diff=0;
+    double n=0;
+  {
+    autoView( Ug_v , Ughost, CpuRead);
+    autoView( Ul_v , Umu   , CpuRead);
+  for(int x=0;x<latt_size[0]+2;x++){
+  for(int y=0;y<latt_size[1]+2;y++){
+  for(int z=0;z<latt_size[2]+2;z++){
+  for(int t=0;t<latt_size[3]+2;t++){
+    int lx=(x-1+latt_size[0])%latt_size[0];
+    int ly=(y-1+latt_size[1])%latt_size[1];
+    int lz=(z-1+latt_size[2])%latt_size[2];
+    int lt=(t-1+latt_size[3])%latt_size[3];
+    Coordinate gcoor({x,y,z,t});
+    Coordinate lcoor({lx,ly,lz,lt});
+    LorentzColourMatrix g;
+    LorentzColourMatrix l;
+    peekLocalSite(g,Ug_v,gcoor);
+    peekLocalSite(l,Ul_v,lcoor);
+    g=g-l;
+    assert(norm2(g)==0);
+    diff = diff + norm2(g);
+    n = n + norm2(l);
+  }}}}
+  }
+  std::cout << "padded field check diff "<< diff <<" / "<< n<<std::endl;
+  std::cout << norm2(Ughost)<< " " << norm2(Umu)<<std::endl;
+  }
+#endif
+
+  ///// Array for the site plaquette
+  GridBase *GhostGrid = Ughost.Grid();
+  LatticeComplex gplaq(GhostGrid); 
+  
+  std::vector<Coordinate> shifts; // four entries per (mu,nu) pair with mu<nu, in the order read back by the loop below
+  for(int mu=0;mu<Nd;mu++){
+    for(int nu=mu+1;nu<Nd;nu++){
+  
+      //    Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
+      Coordinate shift_0(Nd,0);
+      Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
+      Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
+      shifts.push_back(shift_0);
+      shifts.push_back(shift_mu);
+      shifts.push_back(shift_nu);
+      shifts.push_back(shift_0);
+    }
+  }
+  GeneralLocalStencil gStencil(GhostGrid,shifts); // stencil built on the padded grid from the shift list above
+
+  gplaq=Zero();
+  {
+    autoView( gp_v , gplaq, CpuWrite);
+    autoView( t_v , trplaq, CpuRead); // NOTE(review): t_v is not referenced in the loop below
+    autoView( U_v , Ughost, CpuRead);
+    for(int ss=0;ss<gp_v.size();ss++){
+      int s=0; // running index into the stencil entries; advances by 4 per (mu,nu) pair
+      for(int mu=0;mu<Nd;mu++){
+	for(int nu=mu+1;nu<Nd;nu++){
+
+	  auto SE0 = gStencil.GetEntry(s+0,ss);
+	  auto SE1 = gStencil.GetEntry(s+1,ss);
+	  auto SE2 = gStencil.GetEntry(s+2,ss);
+	  auto SE3 = gStencil.GetEntry(s+3,ss);
+	
+	  int o0 = SE0->_offset;
+	  int o1 = SE1->_offset;
+	  int o2 = SE2->_offset;
+	  int o3 = SE3->_offset;
+	  
+	  auto U0 = U_v[o0](mu);      // Umu at shift_0
+	  auto U1 = U_v[o1](nu);      // Unu at shift_mu
+	  auto U2 = adj(U_v[o2](mu)); // Umu^dag at shift_nu
+	  auto U3 = adj(U_v[o3](nu)); // Unu^dag at shift_0
+
+	  gpermute(U0,SE0->_permute); // apply the permutation recorded in each stencil entry
+	  gpermute(U1,SE1->_permute);
+	  gpermute(U2,SE2->_permute);
+	  gpermute(U3,SE3->_permute);
+	  
+	  gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 ); // accumulate the plaquette trace for this (mu,nu) pair
+	  s=s+4;
+	}
+      }
+    }
+  }
+  cplaq = Ghost.Extract(gplaq); // result is assigned to cplaq, which lives on GRID (presumably the padding is stripped -- TODO confirm)
+  RealD vol = cplaq.Grid()->gSites();
+  RealD faces = (Nd * (Nd-1))/2; // number of (mu,nu) pairs with mu<nu
+  auto p = TensorRemove(sum(cplaq));
+  auto result = p.real()/vol/faces/Nc; // normalise by site count, number of planes and Nc
+
+  std::cout << GridLogMessage << " Average plaquette via padded cell "<<result<<std::endl;
+  std::cout << GridLogMessage << " Diff "<<result-plaq<<std::endl;
+  
+  assert(fabs(result-plaq)<1.0e-8); // the two plaquette values must agree to 1e-8
+  Grid_finalize();
+}
--- a/tests/debug/Test_padded_cell_staple.cc
+++ b/tests/debug/Test_padded_cell_staple.cc
@ -0,0 +1,580 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_padded_cell_staple.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+using namespace std;
+using namespace Grid;
+
+template <class Gimpl> class WilsonLoopsTest : public Gimpl {
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  typedef typename Gimpl::GaugeLinkField GaugeMat;
+  typedef typename Gimpl::GaugeField GaugeLorentz;
+
+
+  //Original implementation: sum of the two three-link staples written out below, built from Gimpl covariant shifts; Zero when nu==mu
+  static void StapleOrig(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
+			 int nu) {
+
+    GridBase *grid = Umu.Grid();
+
+    std::vector<GaugeMat> U(Nd, grid); // one link field per direction, peeled off the Lorentz index
+    for (int d = 0; d < Nd; d++) {
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
+    }
+    staple = Zero();
+
+    if (nu != mu) {
+
+      // mu
+      // ^
+      // |__>  nu
+
+      //    __
+      //      |
+      //    __|
+      //
+
+      //Forward: Out(x) = Link(x)*field(x+mu)
+      //Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
+      //ShiftStaple: Link(x) = Link(x+mu)
+
+      //tmp1 = U^dag_nu(x-nu)
+      //tmp2 = U^dag_mu(x-mu) tmp1(x-mu) = U^dag_mu(x-mu) U^dag_nu(x-nu-mu)
+      //tmp3 = U_nu(x) tmp2(x+nu) = U_nu(x)U^dag_mu(x-mu+nu) U^dag_nu(x-mu)
+      //tmp4 = tmp3(x+mu) = U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
+
+      staple += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftBackward(
+										  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+				   mu);
+
+      //  __
+      // |
+      // |__
+      //
+      //
+    
+      //tmp1 = U_mu^dag(x-mu) U_nu(x-mu)
+      //tmp2 = U_nu^dag(x-nu) tmp1(x-nu) = U_nu^dag(x-nu) U_mu^dag(x-mu-nu) U_nu(x-mu-nu)
+      //tmp3 = tmp2(x+mu) = U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
+      staple += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftBackward(U[nu], nu,
+							   Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+				   mu);
+    }
+  }
+
+  static void StaplePadded(GaugeMat &staple, const GaugeLorentz &U, int mu, // same two staple terms as StapleOrig, via a depth-1 PaddedCell and a GeneralLocalStencil
+			   int nu) {
+    if(nu==mu){
+      staple = Zero();
+      return;
+    }
+    double peek = 0, construct = 0, exchange = 0, coord = 0, stencil =0, kernel = 0, extract = 0, total = 0; // per-phase timers, in the units returned by usecond()
+    
+    double tstart = usecond();
+    double t=tstart;
+    
+    PaddedCell Ghost(1, (GridCartesian*)U.Grid()); // depth 1: every shift used below moves at most one site per direction
+
+    construct += usecond() - t;
+      
+    t=usecond();      
+    GaugeMat U_mu = PeekIndex<LorentzIndex>(U, mu);
+    GaugeMat U_nu = PeekIndex<LorentzIndex>(U, nu);
+    peek += usecond() - t;
+
+    t=usecond();
+    CshiftImplGauge<Gimpl> cshift_impl; // passed to Exchange; presumably applies the Gimpl boundary treatment when filling the padding -- TODO confirm
+    GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl);
+    GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl);
+    exchange += usecond() - t;
+    
+    GridBase *ggrid = Ug_mu.Grid(); // grid of the padded fields
+
+    GaugeMat gStaple(ggrid);
+
+    t=usecond();
+    Coordinate shift_0(Nd,0);
+    Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
+    Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
+    Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
+    Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
+
+    std::vector<Coordinate> shifts; // listed right-to-left relative to the products in the comments; the kernel reads entries 0..5 in this order
+
+    //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
+    shifts.push_back(shift_0);
+    shifts.push_back(shift_nu);
+    shifts.push_back(shift_mu);
+
+    //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
+    shifts.push_back(shift_mnu);
+    shifts.push_back(shift_mnu);
+    shifts.push_back(shift_mnu_pmu);
+    coord += usecond()-t;
+
+    t=usecond();
+    GeneralLocalStencil gStencil(ggrid,shifts);
+    stencil += usecond() -t;
+
+    t=usecond();
+    {
+      autoView( gStaple_v , gStaple, AcceleratorWrite);
+      auto gStencil_v = gStencil.View();
+      autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
+      autoView( Ug_nu_v , Ug_nu, AcceleratorRead);
+  
+      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	  GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); // entries 0..2: first staple term
+	  auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
+	  e = gStencil_v.GetEntry(1,ss);
+	  auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
+	  e = gStencil_v.GetEntry(2,ss);
+	  auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
+      
+	  auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x;
+
+	  e = gStencil_v.GetEntry(3,ss); // entries 3..5: second staple term
+	  auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
+	  e = gStencil_v.GetEntry(4,ss);
+	  auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
+	  e = gStencil_v.GetEntry(5,ss);
+	  auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
+
+	  stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu;
+      
+	  coalescedWrite(gStaple_v[ss],stencil_ss);
+	}
+	);
+    } //ensure views are all closed before Extract is called on gStaple
+    kernel += usecond() - t;
+
+    t=usecond();
+    staple = Ghost.Extract(gStaple); // result assigned to the caller's (unpadded) staple field
+    extract += usecond()-t;
+    
+    total += usecond() - tstart;
+    std::cout << GridLogMessage << "StaplePadded timings peek:" << peek << " construct:" << construct << " exchange:" << exchange << " coord:" << coord << " stencil:" << stencil << " kernel:" << kernel << " extract:" << extract << " total:" << total << std::endl;
+  }
+
+  static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu, // reference rectangle staple for direction mu: six five-link terms per nu != mu, built from Gimpl covariant shifts
+			     int mu) {
+    GridBase *grid = Umu.Grid();
+
+    std::vector<GaugeMat> U(Nd, grid); // one link field per direction, peeled off the Lorentz index
+    for (int d = 0; d < Nd; d++) {
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
+    }
+
+    Stap = Zero();
+
+    for (int nu = 0; nu < Nd; nu++) {
+      if (nu != mu) {
+        //           __ ___
+        //          |    __ |
+        //
+	//tmp1 = U_nu^dag(x-nu)
+	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu) U_nu^dag(x-nu-mu)
+	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu) U_mu^dag(x-2mu) U_nu^dag(x-nu-2mu)
+	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu) U_mu^dag(x-2mu+nu) U_nu^dag(x-2mu)
+	//tmp5 = U_mu(x)tmp4(x+mu) = U_mu(x)U_nu(x+mu)U_mu^dag(x+nu) U_mu^dag(x-mu+nu) U_nu^dag(x-mu)
+	//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+	
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[mu], mu,
+							  Gimpl::CovShiftForward(
+										 U[nu], nu,
+										 Gimpl::CovShiftBackward(
+													 U[mu], mu,
+													 Gimpl::CovShiftBackward(
+																 U[mu], mu,
+																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+				   mu);
+
+        //              __
+        //          |__ __ |
+
+	//tmp1 = U^dag_mu(x-mu)U_nu(x-mu)
+	//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)
+	//tmp3 = U^dag_nu(x-nu)tmp2(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)
+	//tmp4 = U_mu(x)tmp3(x+mu) = U_mu(x)U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)
+	//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+	
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[mu], mu,
+							  Gimpl::CovShiftBackward(
+										  U[nu], nu,
+										  Gimpl::CovShiftBackward(
+													  U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
+				   mu);
+
+        //           __
+        //          |__ __ |
+	//Forward: Out(x) = Link(x)*field(x+mu)
+	//Backward: Out(x) = Link^dag(x-mu)*field(x-mu)
+	//ShiftStaple: Link(x) = Link(x+mu)
+
+	//tmp1 = U_nu(x)U_mu(x+nu)
+	//tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U_nu(x-mu)U_mu(x+nu-mu)
+	//tmp3 = U^dag_mu(x-mu)tmp2(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)U_mu(x+nu-2mu)
+	//tmp4 = U^dag_nu(x-nu)tmp3(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)U_mu(x-2mu)
+	//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftBackward(
+							   U[nu], nu,
+							   Gimpl::CovShiftBackward(
+										   U[mu], mu,
+										   Gimpl::CovShiftBackward(
+													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
+				   mu);
+
+        //           __ ___
+        //          |__    |
+	//tmp1 = U_nu^dag(x-nu)U_mu(x-nu)
+	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_mu(x-mu-nu)
+	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_mu^dag(x-2mu)U_nu^dag(x-2mu-nu)U_mu(x-2mu-nu)
+	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_mu^dag(x-2mu+nu)U_nu^dag(x-2mu)U_mu(x-2mu)
+	//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftBackward(
+										  U[mu], mu,
+										  Gimpl::CovShiftBackward(
+													  U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
+				   mu);
+
+        //       --
+        //      |  |
+        //
+        //      |  |
+	//tmp1 = U_nu^dag(x-nu)
+	//tmp2 = U_nu^dag(x-nu)tmp1(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)
+	//tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_nu^dag(x-mu-2nu)
+	//tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_nu^dag(x-mu-nu)
+	//tmp5 = U_nu(x)tmp4(x+nu) = U_nu(x)U_nu(x+nu)U_mu^dag(x-mu+2nu)U_nu^dag(x-mu+nu)U_nu^dag(x-mu)
+	//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftForward(
+										 U[nu], nu,
+										 Gimpl::CovShiftBackward(
+													 U[mu], mu,
+													 Gimpl::CovShiftBackward(
+																 U[nu], nu,
+																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+				   mu);
+
+        //      |  |
+        //
+        //      |  |
+        //       --
+	//tmp1 = U_nu(x)U_nu(x+nu)
+	//tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu(x-mu)U_nu(x-mu+nu)
+	//tmp3 = U_nu^dag(x-nu)tmp2(x-nu) = U_nu^dag(x-nu)U_mu^dag(x-mu-nu)U_nu(x-mu-nu)U_nu(x-mu)
+	//tmp4 = U_nu^dag(x-nu)tmp3(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)U_mu^dag(x-mu-2nu)U_nu(x-mu-2nu)U_nu(x-mu-nu)
+	//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+        Stap += Gimpl::ShiftStaple(
+				   Gimpl::CovShiftBackward(
+							   U[nu], nu,
+							   Gimpl::CovShiftBackward(
+										   U[nu], nu,
+										   Gimpl::CovShiftBackward(
+													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
+				   mu);
+      }
+    }
+  }
+
+
+  static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U, // rectangle staple for direction mu via a depth-2 PaddedCell and a GeneralLocalStencil; main compares it with RectStapleOrig
+			       int mu) {
+    PaddedCell Ghost(2,(GridCartesian*)U.Grid()); // depth 2: the shift table below reaches two sites away (genShift(+2,..), genShift(..,+2), genShift(..,-2))
+    GridBase *ggrid = Ghost.grids.back(); // presumably the fully padded grid -- TODO confirm against PaddedCell
+    
+    CshiftImplGauge<Gimpl> cshift_impl;
+    std::vector<GaugeMat> Ug_dirs(Nd,ggrid); // padded link field for every direction
+    for(int i=0;i<Nd;i++) Ug_dirs[i] = Ghost.Exchange(PeekIndex<LorentzIndex>(U, i), cshift_impl);
+
+    GaugeMat gStaple(ggrid);
+
+    std::vector<Coordinate> shifts; // 30 shifts per nu != mu (six five-link terms); the kernel consumes them in exactly this order
+    for (int nu = 0; nu < Nd; nu++) {
+      if (nu != mu) {
+	auto genShift = [&](int mushift,int nushift){ // displacement of mushift along mu and nushift along nu
+	  Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
+	};
+
+	//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+	shifts.push_back(genShift(0,0)); // shifts are pushed for the rightmost factor first
+	shifts.push_back(genShift(0,+1));
+	shifts.push_back(genShift(+1,+1));
+	shifts.push_back(genShift(+2,0));
+	shifts.push_back(genShift(+1,0));
+
+	//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+	shifts.push_back(genShift(0,-1));
+	shifts.push_back(genShift(0,-1));
+	shifts.push_back(genShift(+1,-1));
+	shifts.push_back(genShift(+2,-1));
+	shifts.push_back(genShift(+1,0));
+
+	//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+	shifts.push_back(genShift(-1,0));
+	shifts.push_back(genShift(-1,-1));
+	shifts.push_back(genShift(-1,-1));
+	shifts.push_back(genShift(0,-1));
+	shifts.push_back(genShift(+1,-1));
+
+	//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+	shifts.push_back(genShift(-1,0));
+	shifts.push_back(genShift(-1,0));
+	shifts.push_back(genShift(-1,+1));
+	shifts.push_back(genShift(0,+1));
+	shifts.push_back(genShift(+1,0));
+
+	//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+	shifts.push_back(genShift(0,0));
+	shifts.push_back(genShift(0,+1));
+	shifts.push_back(genShift(0,+2));
+	shifts.push_back(genShift(+1,+1));
+	shifts.push_back(genShift(+1,0));
+
+	//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+	shifts.push_back(genShift(0,-1));
+	shifts.push_back(genShift(0,-2));
+	shifts.push_back(genShift(0,-2));
+	shifts.push_back(genShift(+1,-2));
+	shifts.push_back(genShift(+1,-1));
+      }
+    }
+    size_t nshift = shifts.size(); // checked against the kernel's running entry index s
+
+    GeneralLocalStencil gStencil(ggrid,shifts);
+    {
+      autoView( gStaple_v , gStaple, AcceleratorWrite);
+      auto gStencil_v = gStencil.View();
+
+      typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+      size_t vsize = Nd*sizeof(GaugeViewType);
+      GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); // host-side array of Nd views, released with free() below
+      for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = Ug_dirs[i].View(AcceleratorRead); // opened manually, so they are closed manually after the kernel
+      GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize); // device copy of the view array, indexed by direction inside the kernel
+      acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
+
+      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	  decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
+	  stencil_ss = Zero();
+	  int s=0; // running stencil-entry index; must follow the push_back order of shifts above
+	  for(int nu=0;nu<Nd;nu++){
+	    if(nu != mu){
+	      //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+	      GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
+	      auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	    
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0; // U0 is the rightmost factor of the commented product, U4 the leftmost
+
+	      //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+	      //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+	      //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+	      //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+	      //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+	      e = gStencil_v.GetEntry(s++,ss);
+	      U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+	      stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+	    }
+	  }
+	  assert(s==nshift); // every stencil entry must have been consumed exactly once
+	  coalescedWrite(gStaple_v[ss],stencil_ss);
+	}
+	);
+  
+      for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose(); // close the views opened with View() above
+      free(Ug_dirs_v_host);
+      acceleratorFreeDevice(Ug_dirs_v);
+    }   
+    Stap = Ghost.Extract(gStaple);    // result assigned to the caller's (unpadded) field
+  }
+
+
+
+};  
+  
+int main (int argc, char ** argv) // Test driver: checks StaplePadded against StapleOrig and RectStaplePadded against RectStapleOrig, and reports timings
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size  = GridDefaultLatt();
+  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
+  std::cout << " mpi "<<mpi_layout<<std::endl;
+  std::cout << " simd "<<simd_layout<<std::endl;
+  std::cout << " latt "<<latt_size<<std::endl;
+  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
+
+  GridParallelRNG   pRNG(&GRID);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); // fixed seeds for the random gauge field
+  LatticeGaugeField U(&GRID);
+
+  SU<Nc>::HotConfiguration(pRNG,U);
+
+  //typedef PeriodicGimplD Gimpl;
+  typedef ConjugateGimplD Gimpl;
+  std::vector<int> conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; // directions 0 and 3 are flagged for Gimpl::setDirections
+  Gimpl::setDirections(conj_dirs);
+
+  typedef typename WilsonLoopsTest<Gimpl>::GaugeMat GaugeMat;
+  typedef typename WilsonLoopsTest<Gimpl>::GaugeLorentz GaugeLorentz;
+
+  std::cout << GridLogMessage << "Checking Staple" << std::endl;
+  int count = 0; // number of timed comparisons, used to average the timings
+  double torig=0, tpadded=0; // accumulated usecond() differences for each implementation
+  
+  for(int mu=0;mu<Nd;mu++){
+    for(int nu=0;nu<Nd;nu++){
+      if(mu != nu){
+	GaugeMat staple_orig(&GRID), staple_padded(&GRID);
+	double t0 = usecond();
+	WilsonLoopsTest<Gimpl>::StapleOrig(staple_orig,U,mu,nu);
+	double t1 = usecond();
+	WilsonLoopsTest<Gimpl>::StaplePadded(staple_padded,U,mu,nu);
+	double t2 = usecond();
+	torig += t1-t0;  tpadded += t2-t1;
+	++count;
+	
+	GaugeMat diff = staple_orig - staple_padded;
+	double n = norm2(diff);
+	std::cout << GridLogMessage << mu << " " << nu << " " << n << std::endl;
+	assert(n<1e-10); // the two implementations must agree for every (mu,nu) pair
+      }
+    }
+  }
+  std::cout << GridLogMessage << "Staple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl;
+  count=0; torig=tpadded=0; // reset the accumulators before the rectangle-staple pass
+    
+  std::cout << GridLogMessage << "Checking RectStaple" << std::endl;
+  for(int mu=0;mu<Nd;mu++){
+    GaugeMat staple_orig(&GRID), staple_padded(&GRID);
+    double t0 = usecond();
+    WilsonLoopsTest<Gimpl>::RectStapleOrig(staple_orig,U,mu);
+    double t1 = usecond();
+    WilsonLoopsTest<Gimpl>::RectStaplePadded(staple_padded,U,mu);
+    double t2 = usecond();
+    torig += t1-t0;  tpadded += t2-t1;
+    ++count;
+    
+    GaugeMat diff = staple_orig - staple_padded;
+    double n = norm2(diff);
+    std::cout << GridLogMessage << mu << " " << n << std::endl;
+    assert(n<1e-10); // the two implementations must agree for every mu
+  }
+  std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl;
+  
+  Grid_finalize();
+}
--- a/tests/forces/Test_double_ratio.cc
+++ b/tests/forces/Test_double_ratio.cc
@ -476,7 +476,9 @@ int main (int argc, char ** argv)
  //  ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);

  //////////////////// One flavour boundary det  ////////////////////
+  /*
  RationalActionParams OFRp; // Up/down
+  int SP_iters = 3000;
  OFRp.lo       = 6.0e-5;
  OFRp.hi       = 90.0;
  OFRp.inv_pow  = 2;
@ -489,7 +491,7 @@ int main (int argc, char ** argv)
  //  OFRp.degree   = 16;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
-  /*
+  */
  OneFlavourRationalParams OFRp; // Up/down
  OFRp.lo       = 4.0e-5;
  OFRp.hi       = 90.0;
@ -499,7 +501,6 @@ int main (int argc, char ** argv)
  OFRp.degree   = 18;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
-  */
  std::vector<RealD> ActionTolByPole({
      1.0e-7,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
--- a/tests/forces/Test_fthmc.cc
+++ b/tests/forces/Test_fthmc.cc
@ -0,0 +1,219 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/forces/Test_fthmc.cc
+
+    Copyright (C) 2022
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
+#include <Grid/qcd/smearing/JacobianAction.h>
+
+using namespace std;
+using namespace Grid;
+
+typedef MobiusFermionD FermionAction;
+typedef WilsonImplD FimplD;
+typedef WilsonImplD FermionImplPolicy;
+
+/*
+ * Finite-step consistency check between an action and its derivative.
+ *
+ * Sequence (order matters, it is part of the test):
+ *   1. draw momenta P and refresh the action on the incoming configuration
+ *   2. measure S1
+ *   3. advance the unsmeared links by eps along P, evaluate the derivative there
+ *   4. advance the links by eps again and measure S2
+ *   5. compare S2-S1 with the change predicted from the derivative
+ *
+ * action : action under test
+ * smU    : configuration container; its field is overwritten via set_Field
+ * Filter : accepted for signature compatibility; the applyFilter calls are
+ *          commented out, so it is not used
+ *
+ * Results are only logged; the closing assert is disabled.
+ */
+template<class Gimpl>
+void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeField> & smU,MomentumFilterBase<LatticeGaugeField> &Filter)
+{
+  const char *stars  = "*********************************************************";
+  const char *pluses = "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++";
+
+  // Three log lines: rule, label followed by the action name, rule
+  auto banner = [&action](const char *rule, const char *label) {
+    std::cout << GridLogMessage << rule << std::endl;
+    std::cout << GridLogMessage << label << action.action_name() << std::endl;
+    std::cout << GridLogMessage << rule << std::endl;
+  };
+
+  LatticeGaugeField links = smU.get_U(false); // unsmeared config
+  GridBase *grid = links.Grid();
+
+  // Fixed seeds so that every invocation draws the same momenta / noise
+  std::vector<int> rng_seeds({1,2,3,5});
+  GridSerialRNG serial_rng;
+  serial_rng.SeedFixedIntegers(rng_seeds);
+  GridParallelRNG parallel_rng(grid);
+  parallel_rng.SeedFixedIntegers(rng_seeds);
+
+  LatticeColourMatrix mom_mu(grid);
+  LatticeGaugeField mom(grid);
+  LatticeGaugeField force(grid);
+
+  banner(stars," Force test for ");
+
+  const RealD eps=0.01;
+
+  banner(pluses," Refresh ");
+
+  Gimpl::generate_momenta(mom,serial_rng,parallel_rng);
+  //  Filter.applyFilter(mom);
+
+  action.refresh(smU,serial_rng,parallel_rng);
+
+  banner(pluses," Action ");
+
+  const RealD S_before = action.S(smU);
+
+  // First step: the derivative is evaluated on this intermediate field
+  Gimpl::update_field(mom,links,eps);
+  smU.set_Field(links);
+
+  banner(pluses," Derivative ");
+  action.deriv(smU,force);
+  force = Ta(force);
+  //  Filter.applyFilter(force);
+
+  DumpSliceNorm("Force",force,Nd-1);
+
+  // Second step: action is measured again on the fully advanced field
+  Gimpl::update_field(mom,links,eps);
+  smU.set_Field(links);
+
+  banner(pluses," Action ");
+
+  const RealD S_after = action.S(smU);
+
+  // Predicted change in action, accumulated direction by direction
+  LatticeComplex dS_site(grid);
+  dS_site = Zero();
+  for(int mu=0;mu<Nd;mu++){
+    auto force_mu = PeekIndex<LorentzIndex>(force,mu);
+    mom_mu = PeekIndex<LorentzIndex>(mom,mu);
+    dS_site = dS_site - trace(mom_mu*force_mu)*eps*2.0*HMC_MOMENTUM_DENOMINATOR;
+  }
+  const ComplexD dS_predicted = sum(dS_site);
+  const RealD mismatch = S_after-S_before-dS_predicted.real();
+
+  std::cout << GridLogMessage << pluses << std::endl;
+  std::cout << GridLogMessage << "S1 : " << S_before << std::endl;
+  std::cout << GridLogMessage << "S2 : " << S_after << std::endl;
+  std::cout << GridLogMessage << "dS : " << S_after-S_before << std::endl;
+  std::cout << GridLogMessage << "dSpred : " << dS_predicted.real() << std::endl;
+  std::cout << GridLogMessage << "diff : " << mismatch << std::endl;
+  std::cout << GridLogMessage << stars << std::endl;
+  //  assert(mismatch<1.0);
+  std::cout << GridLogMessage << "Done" << std::endl;
+  std::cout << GridLogMessage << stars << std::endl;
+}
+
+/*
+ * Driver for the FTHMC force tests.
+ *
+ * Builds a hot-start gauge field, the gauge actions (PlaqAction, RectAction),
+ * a two-flavour ratio pseudofermion action (Nf2) and the Jacobian action, then
+ * runs ForceTest on each of them through the masked smeared configuration
+ * (SmartConfig) and, except for the Jacobian, through the plain stout smeared
+ * configuration (StoutConfig). The container is reset to the same starting
+ * field U before every test because ForceTest overwrites it.
+ */
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::cout << std::setprecision(14);
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate mpi_layout  = GridDefaultMpi();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  // NOTE(review): shm is not read again in this function; call kept unchanged
+  Coordinate shm;
+  GlobalSharedMemory::GetShmDims(mpi_layout,shm);
+
+  const int Ls=12;
+  // Reuse the layouts computed above rather than querying the defaults again
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+
+  ///////////////////// Gauge Field and Gauge Forces ////////////////////////////
+  LatticeGaugeField U(UGrid);
+
+#if  0
+  FieldMetaData header;
+  std::string file("./ckpoint_lat.2000");
+  NerscIO::readConfiguration(U,header,file);
+#else
+  std::vector<int> seeds({1,2,3,4,5,6,7,8});
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds);
+  SU<Nc>::HotConfiguration(RNG4,U);
+#endif
+
+
+  WilsonGaugeActionR  PlaqAction(6.0);
+  IwasakiGaugeActionR RectAction(2.13);
+  PlaqAction.is_smeared = true;
+  RectAction.is_smeared = true;
+
+  ////////////////////////////////////
+  // Fermion Action
+  ////////////////////////////////////
+  RealD mass=0.01;
+  RealD pvmass=1.0;
+  RealD M5=1.8;
+  RealD b=1.5;
+  RealD c=0.5;
+
+  // Double versions
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+  FermionAction DdwfPeriodic(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,Params);
+  FermionAction PVPeriodic  (U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pvmass,M5,b,c,Params);
+
+  const double StoppingCondition = 1.0e-8;
+  // Iteration cap is a count: keep it integral instead of converting from double
+  const int MaxCGIterations = 50000;
+  ConjugateGradient<LatticeFermion>  CG(StoppingCondition,MaxCGIterations);
+
+  TwoFlavourRatioPseudoFermionAction<FimplD> Nf2(PVPeriodic, DdwfPeriodic,CG,CG);
+  Nf2.is_smeared = true;
+
+  ////////////////////////////////////////////////
+  // Plaquette only FTHMC smearer
+  ////////////////////////////////////////////////
+  double rho = 0.1;
+  Smear_Stout<PeriodicGimplR> Smearer(rho);
+  SmearedConfigurationMasked<PeriodicGimplR> SmartConfig(UGrid,2*Nd,Smearer);
+  SmearedConfiguration<PeriodicGimplR> StoutConfig(UGrid,1,Smearer);
+
+  JacobianAction<PeriodicGimplR> Jacobian(&SmartConfig);
+
+  ////////////////////////////////////////////////
+  // Run some tests
+  ////////////////////////////////////////////////
+  MomentumFilterNone<LatticeGaugeField> FilterNone;
+
+  std::cout << " *********  FIELD TRANSFORM SMEARING ***** "<<std::endl;
+
+  // set_Field(U) before each test restores the common starting configuration
+  SmartConfig.set_Field(U);
+  ForceTest<GimplTypesR>(PlaqAction,SmartConfig,FilterNone);
+
+  SmartConfig.set_Field(U);
+  ForceTest<GimplTypesR>(RectAction,SmartConfig,FilterNone);
+
+  SmartConfig.set_Field(U);
+  ForceTest<GimplTypesR>(Jacobian,SmartConfig,FilterNone);
+
+  SmartConfig.set_Field(U);
+  ForceTest<GimplTypesR>(Nf2,SmartConfig,FilterNone);
+
+  std::cout << " *********    STOUT SMEARING ***** "<<std::endl;
+
+  StoutConfig.set_Field(U);
+  ForceTest<GimplTypesR>(PlaqAction,StoutConfig,FilterNone);
+
+  StoutConfig.set_Field(U);
+  ForceTest<GimplTypesR>(RectAction,StoutConfig,FilterNone);
+
+  StoutConfig.set_Field(U);
+  ForceTest<GimplTypesR>(Nf2,StoutConfig,FilterNone);
+
+
+  Grid_finalize();
+}
Author	SHA1	Message	Date
Christopher Kelly	e28cf2b8be	Merge `1dfaa08afb` into `b8a7004365`	2023-09-03 09:29:21 -07:00
Peter Boyle	b8a7004365	Partial fraction test	2023-08-14 15:17:03 -04:00
Peter Boyle	994512048e	Merge pull request #439 from felixerben/bugfix/IRL_convergence Bugfix/irl convergence	2023-07-12 16:32:26 -04:00
Christopher Kelly	1dfaa08afb	The stencils for the staple and rect-staple padded cell implementations are now created and stored by workspace classes that allow for reuse providing the grids remain consistent The workspaces are now used by the plaq+rectangle gauge action resulting in a further 2x performance improvement as measured on a 16^4 local volume for 2 nodes (16 ranks) of Crusher	2023-06-28 15:11:24 -04:00
Christopher Kelly	f44dce390f	Implemented acclerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding Fixed const correctness on PaddedCell methods Fixed compile issues on Crusher Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance Optimized kernel for StaplePadded Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings	2023-06-27 14:58:10 -04:00
Christopher Kelly	bb71e9a96a	Added PaddedCell and GeneralisedLocalStencil header includes to standard base headers Moved versions of the padded-cell implementations of staple and rect-staple from test code to WilsonLoops header Added StapleAndRectStapleAll which is now called by the plaq+rectangle action class. Under the hood it uses the padded cell implementations with maximal reuse of the padded gauge links	2023-06-27 11:23:30 -04:00
Felix Erben	78bae9417c	returning Nstop vectors even if not all meet true convergence criterion	2023-06-27 14:38:19 +01:00
Felix Erben	dd170ead01	whitespace	2023-06-27 11:37:01 +01:00
Felix Erben	014704856f	do one more iteration if not all vectors converged	2023-06-27 11:33:30 +01:00
Christopher Kelly	6f6844ccf1	Added new StapleAll and RectStapleAll functions that return the staples for all mu as an array Modified plaq+rectangle gauge actions to use the above Added a test code to confirm the above changes	2023-06-26 15:48:47 -04:00
Christopher Kelly	4c6613d72c	Modified RectStapleDouble and RectStapleOptimised to use Gauge-BC respecting CshiftLink Added test code tests/debug/Test_optimized_staple_gaugebc demonstrating equivalence of above to RectStapleUnoptimised for cconj gauge BCs Removed optimized staple only being used for periodic gauge BCs; it is now always used	2023-06-26 10:20:23 -04:00
Peter Boyle	ee92e08edb	Merge pull request #435 from fjosw/fix/warnings_in_WilsonKernelsImplementation Unused variable in WilsonKernelsImplementation	2023-06-23 11:47:19 -04:00
Peter Boyle	c1dcee9328	Merge pull request #437 from fjosw/fix/stencil_debug Added GridLogDebug to BuildSurfaceList debug message	2023-06-23 11:47:00 -04:00
Peter Boyle	6b150961fe	Better script	2023-06-23 18:09:25 +03:00
Christopher Kelly	36cc9c524f	Threaded the constructor of GeneralLocalStencil	2023-06-23 09:57:38 -04:00
Peter Boyle	5bafcaedfa	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-06-22 19:59:45 +03:00
Peter Boyle	bfeceae708	FTHMC	2023-06-22 12:58:18 -04:00
Peter Boyle	eacb66591f	Config command	2023-06-22 19:56:40 +03:00
Peter Boyle	fadaa85626	Update	2023-06-22 19:56:27 +03:00
Peter Boyle	02a5b0d786	Updating run during testing	2023-06-22 19:52:46 +03:00
Peter Boyle	0e2141442a	Dennis says broken	2023-06-22 19:19:51 +03:00
Peter Boyle	769eb0eecb	Precision coverage	2023-06-22 19:19:20 +03:00
Christopher Kelly	4241c7d4a3	Imported coalescedReadGeneralPermute GPU implementation from Christoph Fixed bug in padded staple code where extract was being called on the result before the GPU view was closed Fixed compile issue with pointer cast in padded staple code Added timing summaries of padded staple code and timing breakdown of staple implementation to Test_padded_cell_staple	2023-06-21 16:01:01 -04:00
Christopher Kelly	7b11075102	The user can now specify the implementation of Cshift used by the PaddedCell class through a virtual base class API. Implementations for default (regular Cshift) and for gauge links (which respects the gauge BCs) Fixed const-correctness for PaddedCell and ConjugateGimpl::setDirections Modified test code for padded-cell implementation of staple, rect-staple to use cconj BCs	2023-06-20 17:09:56 -04:00
Christopher Kelly	abc658dca5	Added coalescedReadGeneralPermute CPU implementation based on Christoph's GPT code In a test code, implemented a padded-cell version of the staple and rectangular-staple calculation	2023-06-20 16:14:25 -04:00
Fabian Joswig	85e35c4da1	fix: added GridLogDebug to BuildSurfaceList debug message.	2023-06-16 10:31:16 +01:00
Peter Boyle	d72e914cf0	Profiling temporary code until optimised	2023-06-15 10:43:04 -04:00
Peter Boyle	3b5254e2d5	Optional checkpoint smeared configs for FTHMC	2023-06-15 10:43:04 -04:00
Peter Boyle	f1c358b596	Additional tests	2023-06-15 10:43:04 -04:00
Peter Boyle	c0ef210265	Hot start should be properly Hot	2023-06-15 10:43:04 -04:00
Peter Boyle	e3e1cc1962	Ta project	2023-06-15 10:43:04 -04:00
Peter Boyle	723eadbb5c	Keep methods virtual	2023-06-15 10:43:04 -04:00
Peter Boyle	e24637ec1e	Clean up	2023-06-15 10:43:04 -04:00
Peter Boyle	8b01ff4ce7	Integrator over to smeared force structure	2023-06-15 10:43:04 -04:00
Peter Boyle	588197c487	Smeared action virtual class	2023-06-15 10:43:04 -04:00
Peter Boyle	1352bad2e4	Sunspot compile	2023-06-15 11:22:46 +00:00
Peter Boyle	ffd7301649	Updated masked / fthmc smeared config container	2023-06-01 06:23:02 -04:00
Peter Boyle	d2a8494044	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-06-01 06:22:33 -04:00
Peter Boyle	0982e0d19b	Jacobian action wrapper for FTHMC	2023-06-01 06:15:08 -04:00
Peter Boyle	3badbfc3c1	Refactor the Action and Smeared gauge configuration containers. Add first pass at FTHMC action	2023-06-01 06:14:28 -04:00
Peter Boyle	5465961e30	New test for FTHMC portion	2023-06-01 06:14:04 -04:00
Fabian Joswig	477b794bc5	fix: unused variable removed.	2023-05-29 14:08:53 +01:00
Peter Boyle	4835fd1a87	HIP stream synch	2023-05-27 17:58:22 +03:00
Peter Boyle	6533c25814	Lumi	2023-05-27 16:13:32 +03:00
Peter Boyle	1b2914ec09	FT-HMC smearing, derivative chain rule, log det and force first pass.	2023-05-22 10:21:37 -04:00
Peter Boyle	519f795066	Header not liked by gcc on mac? puzzling	2023-05-22 10:21:12 -04:00
Peter Boyle	4240ad5ca8	Preparing for FTHMC	2023-05-19 21:21:55 -04:00
Peter Boyle	d418347d86	public for convenience to see rho params	2023-05-19 21:21:05 -04:00
Peter Boyle	29a4bfe5e5	Clean up	2023-05-19 21:20:45 -04:00
Peter Boyle	9955bf9daf	Regresses to Qlat	2023-05-19 17:32:13 -04:00
Peter Boyle	876c8f4478	Nodes on padded cell	2023-05-11 12:35:49 -04:00
Peter Boyle	9c8750f261	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-05-11 12:29:09 -04:00
Peter Boyle	91efd08179	Option for Qlat generator basis	2023-05-11 12:27:45 -04:00
Peter Boyle	9953511b65	Mac compile	2023-05-11 12:27:29 -04:00
Peter Boyle	025fa9991a	For FTHMC	2023-05-11 12:26:14 -04:00
Peter Boyle	e8c60c355b	Padded cell code	2023-05-11 12:25:50 -04:00
Peter Boyle	6c9c7f9d85	Permute fix	2023-05-11 12:24:21 -04:00
Peter Boyle	f534523ede	Debug	2023-05-11 12:23:11 -04:00
Peter Boyle	1b8a834beb	Debug	2023-05-11 12:22:24 -04:00
Peter Boyle	3aa43e6065	Debug info	2023-04-20 14:21:13 -04:00
Peter Boyle	78ac4044ff	HMC	2023-04-20 13:28:07 -04:00
Peter Boyle	119c3db47f	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-04-18 15:13:16 -04:00
Peter Boyle	21bbdb8fc2	Crusher	2023-04-18 15:11:16 -04:00
Peter Boyle	739bd7572c	Example code	2023-04-17 21:51:55 +00:00
Peter Boyle	074627a5bd	Pass file descriptors through AF_UNIX for level_zero	2023-04-17 21:50:52 +00:00
Peter Boyle	6a23b2c599	Drop UVM	2023-04-17 21:49:58 +00:00
Peter Boyle	bd891fb3f5	tests to compile	2023-04-12 18:32:44 -04:00
Peter Boyle	3984265851	Merge pull request #432 from paboyle/hotfix/nvcc-warnings Unused statements generating warnings removed	2023-04-12 16:59:02 -04:00
Peter Boyle	45361d188f	Merge pull request #427 from fjosw/feat/bug_report_issue_template Feat/bug report issue template	2023-04-12 16:58:41 -04:00
Peter Boyle	80c9d77e02	Merge pull request #433 from paboyle/hotfix/virtual-dtor Virtual destructor for LinearOperator	2023-04-12 16:56:18 -04:00
Peter Boyle	3aff64dddb	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-04-11 12:19:15 -07:00
Peter Boyle	b4f2ca81ff	Copy queue and compute queue same as better concurrency	2023-04-11 12:18:21 -07:00
Peter Boyle	d1dea5f840	New driver	2023-04-11 12:16:52 -07:00
Peter Boyle	54f8b84d16	Fence	2023-04-11 12:16:08 -07:00
Peter Boyle	da503fef0e	Name change on barrier routine	2023-04-11 12:14:04 -07:00
Peter Boyle	4a6802098a	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-04-07 15:43:28 -04:00
Peter Boyle	f9b41a84d2	Trajectory runs to completion on Crusher within wall clock time	2023-04-07 15:42:45 -04:00
Antonin Portelli	5d7e0d18b9	virtual destructor for LinearOperator	2023-04-07 14:30:38 +01:00
Peter Boyle	86dac5ff4f	Better printing	2023-04-04 07:42:19 -07:00
Peter Boyle	4a382fad3f	Use distinct SYCL queue for copies	2023-04-04 07:41:41 -07:00
Peter Boyle	cc753670d9	Barrier elimination, surface list build	2023-04-04 07:39:14 -07:00
Peter Boyle	cc9d88ea1c	Fence changes and EXT kernel loop cout reduction	2023-04-04 07:37:23 -07:00
Peter Boyle	b281b0166e	Put the barrier in the subroutine	2023-04-04 07:36:03 -07:00
Peter Boyle	6a21f694ff	Apply barrier in Gather kernel sequence. Could place before comms, or in Gather, but decided to insist Gather means Gather is done	2023-04-04 07:33:24 -07:00
Fabian Joswig	39214702f6	feat: indentation fixed.	2023-03-28 16:30:34 +02:00
Fabian Joswig	3e4614c63a	feat: draft for bug-report issue template added.	2023-03-28 16:24:35 +02:00
Peter Boyle	ccd21f96ff	Plaquette agreeing and moving to final form (slowly) need to optimise	2023-02-01 22:57:44 -05:00
Peter Boyle	4b90cb8888	First cut passes combining padded cell with general stencil towards fast plaquette and staggered force	2023-02-01 22:14:10 -05:00