Boosted fermion attempt

Qslash term added
Better macos
2026-02-24 23:56:13 +00:00 · 2024-10-17 18:37:33 +01:00 · 2023-09-14 16:14:03 -04:00 · 2023-09-14 16:12:21 -04:00 · 2023-08-14 15:17:03 -04:00 · 2023-07-12 16:32:26 -04:00
67 changed files with 3461 additions and 308 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -0,0 +1,54 @@
+name: Bug report
+description: Report a bug.
+title: "<insert title>"
+labels: [bug]
+
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thank you for taking the time to file a bug report.
+        Please check that the code is pointing to the HEAD of develop
+        or any commit in master which is tagged with a version number.
+
+  - type: textarea
+    attributes:
+      label: "Describe the issue:"
+      description: >
+        Describe the issue and any previous attempt to solve it.
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: "Code example:"
+      description: >
+        If relevant, show how to reproduce the issue using a minimal working
+        example.
+      placeholder: |
+        << your code here >>
+      render: shell
+    validations:
+      required: false
+
+  - type: textarea
+    attributes:
+      label: "Target platform:"
+      description: >
+        Give a description of the target platform (CPU, network, compiler).
+        Please give the full CPU part description, using for example
+        `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
+        or `sysctl machdep.cpu.brand_string` (macOS) and the full output
+        of the `--version` option of your compiler.
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: "Configure options:"
+      description: >
+        Please give the exact configure command used and attach
+        `config.log`, `grid.config.summary` and the output of `make V=1`.
+      render: shell
+    validations:
+      required: true
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
@@ -166,16 +166,16 @@ public:
      rsqf[s] =rsq[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      //      ps_d[s] = src_d;
-      precisionChangeFast(ps_f[s],src_d);
+      precisionChange(ps_f[s],src_d);
    }
    // r and p for primary
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    r_d = p_d;
    
    //MdagM+m[0]
-    precisionChangeFast(p_f,p_d);
+    precisionChange(p_f,p_d);
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
-    precisionChangeFast(tmp_d,mmp_f);
+    precisionChange(tmp_d,mmp_f);
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
@@ -204,7 +204,7 @@ public:
  
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
-      precisionChangeFast(psi_f[s],psi_d[s]);
+      precisionChange(psi_f[s],psi_d[s]);
    }
  
    ///////////////////////////////////////
@@ -225,7 +225,7 @@ public:
      AXPYTimer.Stop();

      PrecChangeTimer.Start();
-      precisionChangeFast(r_f, r_d);
+      precisionChange(r_f, r_d);
      PrecChangeTimer.Stop();

      AXPYTimer.Start();
@@ -243,13 +243,13 @@ public:

      cp=c;
      PrecChangeTimer.Start();
-      precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
+      precisionChange(p_f, p_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f);
      MatrixTimer.Stop();  
      PrecChangeTimer.Start();
-      precisionChangeFast(mmp_d, mmp_f); // From Float to Double
+      precisionChange(mmp_d, mmp_f); // From Float to Double
      PrecChangeTimer.Stop();

      d=real(innerProduct(p_d,mmp_d));    
@@ -311,7 +311,7 @@ public:
 	SolverTimer.Stop();

 	for(int s=0;s<nshift;s++){
-	  precisionChangeFast(psi_d[s],psi_f[s]);
+	  precisionChange(psi_d[s],psi_f[s]);
 	}

 	
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -211,7 +211,7 @@ public:
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-    //    assert(norm2(tmp_d)< 1.0e-4);
+    assert(norm2(tmp_d)< 1.0);

    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -419,14 +419,15 @@ until convergence
 	}
      }

-      if ( Nconv < Nstop )
+      if ( Nconv < Nstop ) {
 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
-
+	std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << " of which might meet convergence criterion only approximately" <<std::endl;
+      }
      eval=eval2;
      
      //Keep only converged
-      eval.resize(Nconv);// Nstop?
-      evec.resize(Nconv,grid);// Nstop?
+      eval.resize(Nstop);// was Nconv
+      evec.resize(Nstop,grid);// was Nconv
      basisSortInPlace(evec,eval,reverse);
      
    }
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -519,7 +519,6 @@ void MemoryManager::Audit(std::string s)
  uint64_t LruBytes1=0;
  uint64_t LruBytes2=0;
  uint64_t LruCnt=0;
-  uint64_t LockedBytes=0;
  
  std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
  for(auto it=LRU.begin();it!=LRU.end();it++){
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -27,9 +27,10 @@ Author: Christoph Lehner <christoph@lhnr.de>
 *************************************************************************************/
 /*  END LEGAL */

+#define Mheader "SharedMemoryMpi: "
+
 #include <Grid/GridCore.h>
 #include <pwd.h>
-#include <syscall.h>

 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
@@ -39,11 +40,118 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #endif
 #ifdef GRID_SYCL
 #define GRID_SYCL_LEVEL_ZERO_IPC
+#include <syscall.h>
+#define SHM_SOCKETS 
+#endif
+
+#include <sys/socket.h>
+#include <sys/un.h>
+
+NAMESPACE_BEGIN(Grid); 
+
+#ifdef SHM_SOCKETS
+
+/*
+ * Barbaric extra intranode communication route in case we need sockets to pass FDs
+ * Forced by level_zero not being nicely designed
+ */
+static int sock;
+static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
+static char sock_path[256];
+class UnixSockets {
+public:
+  // Create this rank's datagram socket and bind it to the path sock_path_fmt gives for `rank`.
+  // A stale socket file of that name is unlinked first; a bind failure terminates the process.
+  static void Open(int rank)
+  {
+    sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>=0); // 0 is a valid descriptor, only -1 is failure
+
+    struct sockaddr_un sa_un = { 0 };
+    sa_un.sun_family = AF_UNIX;
+    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
+    unlink(sa_un.sun_path);
+    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
+      perror("bind failure");
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  // Receive one datagram on our socket and return the descriptor carried in its
+  // SCM_RIGHTS control message; returns -1 if the receive fails or carries no descriptor.
+  static int RecvFileDescriptor(void)
+  {
+    int fd = -1;
+    char buf[1];
+    struct iovec iov;
+    struct msghdr msg;
+    char cms[CMSG_SPACE(sizeof(int))];
+
+    iov.iov_base = buf;
+    iov.iov_len = 1;
+    memset(&msg, 0, sizeof msg); // also leaves msg_name / msg_namelen zeroed
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    msg.msg_control = (caddr_t)cms;
+    msg.msg_controllen = sizeof cms;
+
+    int n = recvmsg(sock, &msg, 0);
+    if(n < 0) {
+      perror("recvmsg failed");
+      return -1;
+    }
+    if(n == 0){
+      fprintf(stderr,"recvmsg returned 0\n"); // errno is not set on this path, so perror would mislead
+      return -1;
+    }
+    // The control message may be absent or truncated: validate it before reading CMSG_DATA.
+    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+    if ( cmsg == NULL || cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS || cmsg->cmsg_len < CMSG_LEN(sizeof(int)) ) {
+      fprintf(stderr,"recvmsg: no file descriptor in control message\n");
+      return -1;
+    }
+    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
+    return fd;
+  }
+
+  // Send descriptor `fildes` to the socket bound for rank `xmit_to_rank`, as an SCM_RIGHTS
+  // control message attached to a one-byte datagram. A failed send terminates the process.
+  static void SendFileDescriptor(int fildes,int xmit_to_rank)
+  {
+    struct msghdr msg;
+    struct iovec iov;
+    char ctrl[CMSG_SPACE(sizeof(int))];
+    char data = ' ';
+
+    memset(&msg, 0, sizeof(struct msghdr));
+    memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
+    iov.iov_base = &data;
+    iov.iov_len = sizeof(data);
+    snprintf(sock_path,sizeof(sock_path),sock_path_fmt,xmit_to_rank); // bounded write into the static buffer
+    struct sockaddr_un sa_un = { 0 };
+    sa_un.sun_family = AF_UNIX;
+    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
+
+    msg.msg_name = (void *)&sa_un;
+    msg.msg_namelen = sizeof(sa_un);
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    msg.msg_controllen =  CMSG_SPACE(sizeof(int));
+    msg.msg_control = ctrl;
+
+    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+    cmsg->cmsg_level = SOL_SOCKET;
+    cmsg->cmsg_type = SCM_RIGHTS;
+    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+    memcpy(CMSG_DATA(cmsg), &fildes, sizeof(int)); // CMSG_DATA need not be int-aligned: copy, do not cast
+    if (sendmsg(sock, &msg, 0) < 0) { // a lost send would leave the peer waiting in recvmsg
+      perror("sendmsg failure");
+      exit(EXIT_FAILURE);
+    }
+  }
+};
 #endif


-NAMESPACE_BEGIN(Grid); 
-#define header "SharedMemoryMpi: "
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
@@ -66,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);

  if ( WorldRank == 0) {
-    std::cout << header " World communicator of size " <<WorldSize << std::endl;  
-    std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl;
+    std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;  
+    std::cout << Mheader " Node  communicator of size " <<WorldShmSize << std::endl;
  }
  // WorldShmComm, WorldShmSize, WorldShmRank

@@ -170,10 +278,7 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
 }
-static inline int divides(int a,int b)
-{
-  return ( b == ( (b/a)*a ) );
-}
+
 void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  ////////////////////////////////////////////////////////////////
@@ -347,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
 #ifdef GRID_MPI3_SHMGET
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);

@@ -432,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);  
  }

-  std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;

  SharedMemoryZero(ShmCommBuf,bytes);
@@ -475,7 +580,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
@@ -483,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef SHM_SOCKETS
+  UnixSockets::Open(WorldShmRank);
+#endif
  for(int r=0;r<WorldShmSize;r++){

+    MPI_Barrier(WorldShmComm);
+
 #ifndef GRID_MPI3_SHM_NONE
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
@@ -492,24 +602,32 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    void * thisBuf = ShmCommBuf;
    if(!Stencil_force_mpi) {
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
-    typedef struct { int fd; pid_t pid ; } clone_mem_t;
+    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
      
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
-
+    
    if ( r==WorldShmRank ) { 
      auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
      if ( err != ZE_RESULT_SUCCESS ) {
-	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
      }
      memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
      handle.pid = getpid();
+      memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
+#ifdef SHM_SOCKETS
+      for(int rr=0;rr<WorldShmSize;rr++){
+	if(rr!=r){
+	  UnixSockets::SendFileDescriptor(handle.fd,rr);
+	}
+      }
+#endif
    }
 #endif
 #ifdef GRID_CUDA
@@ -537,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
    { 
+      MPI_Barrier(WorldShmComm);
      int ierr=MPI_Bcast(&handle,
 			 sizeof(handle),
 			 MPI_BYTE,
@@ -552,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    if ( r!=WorldShmRank ) {
      thisBuf = nullptr;
+      int myfd;
+#ifdef SHM_SOCKETS
+      myfd=UnixSockets::RecvFileDescriptor();
+#else
      std::cout<<"mapping seeking remote pid/fd "
 	       <<handle.pid<<"/"
 	       <<handle.fd<<std::endl;
@@ -559,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
      int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
      //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
-      int myfd  = syscall(438,pidfd,handle.fd,0);
-
-      std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
-      
+      myfd  = syscall(438,pidfd,handle.fd,0);
+      int err_t = errno;
+      if (myfd < 0) {
+        fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
+	perror("pidfd_getfd failed ");
+	assert(0);
+      }
+#endif
+      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
+      memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
      memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));

      auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
      if ( err != ZE_RESULT_SUCCESS ) {
-	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
-	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
+	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
+	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@@ -603,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #else
    WorldShmCommBufs[r] = ShmCommBuf;
 #endif
+    MPI_Barrier(WorldShmComm);
  }

  _ShmAllocBytes=bytes;
@@ -614,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -651,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    //    std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -661,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHM_NONE
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -708,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0); 
  MPI_Barrier(WorldShmComm);
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -707,9 +707,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  Coordinate ist = Tg->_istride;
  Coordinate ost = Tg->_ostride;

-  autoView( t_v , To, AcceleratorWrite);
-  autoView( f_v , From, AcceleratorRead);
-  accelerator_for(idx,Fg->lSites(),1,{
+  autoView( t_v , To, CpuWrite);
+  autoView( f_v , From, CpuRead);
+  thread_for(idx,Fg->lSites(),{
    sobj s;
    Coordinate Fcoor(nd);
    Coordinate Tcoor(nd);
@@ -722,15 +722,20 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
    }
    if (in_region) {
-      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
-      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
-      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
-      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
-      vector_type * fp = (vector_type *)&f_v[odx_f];
-      vector_type * tp = (vector_type *)&t_v[odx_t];
+#if 0      
+      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
+      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
+      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
+      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
+      scalar_type * fp = (scalar_type *)&f_v[odx_f];
+      scalar_type * tp = (scalar_type *)&t_v[odx_t];
      for(int w=0;w<words;w++){
 	tp[w].putlane(fp[w].getlane(idx_f),idx_t);
      }
+#else
+    peekLocalSite(s,f_v,Fcoor);
+    pokeLocalSite(s,t_v,Tcoor);
+#endif
    }
  });
 }
@@ -841,9 +846,9 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int

  for(int d=0;d<nh;d++){
    if ( d!=orthog ) {
-    assert(lg->_processors[d]  == hg->_processors[d]);
-    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
-  }
+      assert(lg->_processors[d]  == hg->_processors[d]);
+      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    }
  }

  // the above should guarantee that the operations are local
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -0,0 +1,136 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/PaddedCell.h
+
+    Copyright (C) 2019
+
+Author: Peter Boyle pboyle@bnl.gov
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Builds a chain of grids, each enlarged by 2*depth local sites in one more dimension than
+// the previous one, and copies lattice data between the unpadded grid and those grids.
+class PaddedCell {
+public:
+  GridCartesian * unpadded_grid;        // not owned: stored from the constructor argument
+  int dims;                             // unpadded_grid->Nd()
+  int depth;                            // sites added on each side of a padded dimension
+  std::vector<GridCartesian *> grids;   // owned: grids[d] is padded in dimensions 0..d
+  ~PaddedCell() { DeleteGrids(); }
+  // The grids are held through raw owning pointers, so an implicit copy would delete them twice.
+  PaddedCell(const PaddedCell &) = delete;
+  PaddedCell & operator=(const PaddedCell &) = delete;
+  PaddedCell(int _depth,GridCartesian *_grid)
+  {
+    unpadded_grid = _grid;
+    depth=_depth;
+    dims=_grid->Nd();
+    // Require depth <= local extent in every dimension before allocating any grid.
+    Coordinate local     =unpadded_grid->LocalDimensions();
+    for(int d=0;d<dims;d++){
+      assert(local[d]>=depth);
+    }
+    AllocateGrids();
+  }
+  // Delete every padded grid and empty the list.
+  void DeleteGrids(void)
+  {
+    for(GridCartesian *g : grids) delete g;
+    grids.clear();
+  }
+  // Append one new grid per dimension; the padding accumulates from one grid to the next.
+  void AllocateGrids(void)
+  {
+    Coordinate simd      =unpadded_grid->_simd_layout;
+    Coordinate processors=unpadded_grid->_processors;
+    Coordinate plocal    =unpadded_grid->LocalDimensions();
+    Coordinate global(dims);
+
+    // expand up one dim at a time
+    for(int d=0;d<dims;d++){
+      plocal[d] += 2*depth;
+      // global extent = padded local extent times processor count, recomputed for all dimensions
+      for(int mu=0;mu<dims;mu++){
+	global[mu] = plocal[mu]*processors[mu];
+      }
+      grids.push_back(new GridCartesian(global,simd,processors));
+    }
+  }
+  // Copy the region of `in` starting at local offset depth in every dimension onto unpadded_grid.
+  template<class vobj>
+  inline Lattice<vobj> Extract(Lattice<vobj> &in)
+  {
+    Lattice<vobj> out(unpadded_grid);
+    Coordinate local     =unpadded_grid->LocalDimensions();
+    Coordinate fll(dims,depth); // depends on the MPI spread
+    Coordinate tll(dims,0); // depends on the MPI spread
+    localCopyRegion(in,out,fll,tll,local);
+    return out;
+  }
+  // Pad `in` in every dimension in turn (Expand for d = 0..Nd-1) and return the result.
+  template<class vobj>
+  inline Lattice<vobj> Exchange(Lattice<vobj> &in)
+  {
+    int nd = in.Grid()->Nd(); // named nd so the dims member is not shadowed
+    Lattice<vobj> tmp = in;
+    for(int d=0;d<nd;d++){
+      tmp = Expand(d,tmp); // rvalue && assignment
+    }
+    return tmp;
+  }
+  // expand up one dim at a time
+  // `in` must be conformable with grids[dim-1] (unpadded_grid for dim 0); the result is on grids[dim].
+  template<class vobj>
+  inline Lattice<vobj> Expand(int dim,Lattice<vobj> &in)
+  {
+    GridBase *old_grid = in.Grid();
+    GridCartesian *new_grid = grids[dim];//These are new grids
+    Lattice<vobj>  padded(new_grid);
+    Lattice<vobj> shifted(old_grid);
+    Coordinate local     =old_grid->LocalDimensions();
+    Coordinate plocal    =new_grid->LocalDimensions();
+    if(dim==0) conformable(old_grid,unpadded_grid);
+    else       conformable(old_grid,grids[dim-1]);
+
+    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
+    // Middle bit
+    for(int x=0;x<local[dim];x++){
+      InsertSliceLocal(in,padded,x,depth+x,dim);
+    }
+    // High bit
+    shifted = Cshift(in,dim,depth);
+    for(int x=0;x<depth;x++){
+      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
+    }
+    // Low bit
+    shifted = Cshift(in,dim,-depth);
+    for(int x=0;x<depth;x++){
+      InsertSliceLocal(shifted,padded,x,x,dim);
+    }
+    return padded;
+  }
+};
+ 
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -104,6 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca
 template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+template<typename vtype> using iLorentzComplex            = iVector<iScalar<iScalar<vtype> >, Nd > ;
 template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
@@ -178,6 +179,15 @@ typedef iLorentzColourMatrix<vComplexF>  vLorentzColourMatrixF;
 typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD;
 typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;

+// LorentzComplex
+typedef iLorentzComplex<Complex  > LorentzComplex;
+typedef iLorentzComplex<ComplexF > LorentzComplexF;
+typedef iLorentzComplex<ComplexD > LorentzComplexD;
+
+typedef iLorentzComplex<vComplex > vLorentzComplex;
+typedef iLorentzComplex<vComplexF> vLorentzComplexF;
+typedef iLorentzComplex<vComplexD> vLorentzComplexD;
+
 // DoubleStored gauge field
 typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
@@ -307,6 +317,10 @@ typedef Lattice<vLorentzColourMatrixF>  LatticeLorentzColourMatrixF;
 typedef Lattice<vLorentzColourMatrixD>  LatticeLorentzColourMatrixD;
 typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;

+typedef Lattice<vLorentzComplex>  LatticeLorentzComplex;
+typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
+typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
+
 // DoubleStored gauge field
 typedef Lattice<vDoubleStoredColourMatrix>   LatticeDoubleStoredColourMatrix;
 typedef Lattice<vDoubleStoredColourMatrixF>  LatticeDoubleStoredColourMatrixF;
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -34,10 +34,24 @@ directory

 NAMESPACE_BEGIN(Grid);

+///////////////////////////////////
+// Smart configuration base class
+///////////////////////////////////
+template< class Field >
+class ConfigurationBase // abstract container interface consumed by Action's ConfigurationBase overloads
+{
+public:
+  ConfigurationBase() {}
+  virtual ~ConfigurationBase() {}
+  virtual void set_Field(Field& U) =0; // presumably installs U as the underlying field - implementation not visible here
+  virtual void smeared_force(Field&) = 0; // Action::deriv calls this on dSdU after the field-based deriv when is_smeared is set
+  virtual Field& get_SmearedU() =0; // presumably the smeared field - confirm against the implementing class
+  virtual Field &get_U(bool smeared = false) = 0; // Action passes its is_smeared flag as `smeared`
+};
+
 template <class GaugeField >
 class Action 
 {
-
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
@@ -77,11 +91,39 @@ public:
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
+  /////////////////////////////
  // Heatbath?
+  /////////////////////////////
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
+
+  /////////////////////////////////////////////////////////////
+  // virtual smeared interface through configuration container
+  /////////////////////////////////////////////////////////////
+  virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
+  { // container overload: forward the field selected by is_smeared to the field-based refresh
+    GaugeField &field = U.get_U(is_smeared);  this->refresh(field,sRNG,pRNG);
+  }
+  virtual RealD S(ConfigurationBase<GaugeField>& U)
+  { // container overload: evaluate the field-based S on the field selected by is_smeared
+    GaugeField &field = U.get_U(is_smeared);  return this->S(field);
+  }
+  virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
+  { // container overload: evaluate the field-based Sinitial on the field selected by is_smeared
+    GaugeField &field = U.get_U(is_smeared);  return this->Sinitial(field);
+  }
+  virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
+  { // container overload: field-based deriv first, then the container's smeared_force if smeared
+    GaugeField &field = U.get_U(is_smeared);
+    this->deriv(field,dSdU);
+    if ( !is_smeared ) return;
+    U.smeared_force(dSdU);
+  }
+  ///////////////////////////////
+  // Logging
+  ///////////////////////////////
  virtual std::string action_name()    = 0;                             // return the action name
  virtual std::string LogParameters()  = 0;                             // prints action parameters
  virtual ~Action(){}
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@@ -30,6 +30,8 @@ directory
 #ifndef QCD_ACTION_CORE
 #define QCD_ACTION_CORE

+#include <Grid/qcd/action/gauge/GaugeImplementations.h>
+
 #include <Grid/qcd/action/ActionBase.h>
 NAMESPACE_CHECK(ActionBase);
 #include <Grid/qcd/action/ActionSet.h>
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -124,6 +124,11 @@ public:
  RealD                _b;
  RealD                _c;

+  // possible boost
+  std::vector<ComplexD> qmu;
+  void set_qmu(std::vector<ComplexD> _qmu) { this->qmu=_qmu; assert(this->qmu.size()==Nd); } // stores a copy of _qmu; it must have Nd entries
+  void addQmu(const FermionField &in, FermionField &out, int dag);
+  
  // Cayley form Moebius (tanh and zolotarev)
  Vector<Coeff_t> omega;
  Vector<Coeff_t> bs;    // S dependent coeffs
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -60,6 +60,50 @@ public:
  //      virtual void   Instantiatable(void)=0;
  virtual void   Instantiatable(void) =0;

+  // Free propagator via FFT: removes the boundary phase from `in`, applies
+  // MomentumSpacePropagatorHw with the boundary-shifted twist, then restores the phase on `out`.
+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  {
+    std::cout << "Free Propagator for ContinuedFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    FFT theFFT((GridCartesian *) in.Grid());
+    //phase for boundary condition
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0; // no index offset is applied to the coordinate directions here
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu]));
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+
+    // Transform the de-phased source in_buf (not the raw input) so it matches the shifted momenta.
+    theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
+    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+
+    //phase for boundary condition
+    out = out * exp(ci*ph);
+  };
+
+  // Convenience overload: boundary factor 1 and zero twist in each of the Nd directions.
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    std::vector<Complex> periodic(Nd,Complex(1)); //default: periodic boundary conditions
+    std::vector<double> no_twist(Nd,0.0);         //default: periodic boundarys in all directions
+    FreePropagator(in,out,mass,periodic,no_twist);
+  };
+
+  
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
 public:
  INHERIT_IMPL_TYPES(Impl);

-  const int part_frac_chroma_convention=1;
+  const int part_frac_chroma_convention=0;

  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
@@ -83,12 +83,63 @@ public:
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());

+  PartialFractionFermion5D(GaugeField &_Umu,
+			   GridCartesian         &FiveDimGrid,
+			   GridRedBlackCartesian &FiveDimRedBlackGrid,
+			   GridCartesian         &FourDimGrid,
+			   GridRedBlackCartesian &FourDimRedBlackGrid,
+			   RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
+
+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  {
+    // Free-field propagator: out = e^{+i ph} * FFT^-1[ MomentumSpacePropagatorHw( FFT[ e^{-i ph} * in ] ) ].
+    // boundary and twist are taken by value (each must hold Nd entries); twist is modified locally below.
+    std::cout << "Free Propagator for PartialFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    FFT theFFT((GridCartesian *) in.Grid());
+
+    //phase for boundary condition
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// shift is 0 here, so direction nu indexes the grid coordinate directly.
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu]));
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+    // Transform the phase-rotated source in_buf; transforming the raw `in` would drop the boundary phase.
+    theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
+    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    
+    //phase for boundary condition
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    // Three-argument convenience form: zero twist and a boundary factor of 1 in each of the Nd directions.
+    const std::vector<double>  zero_twist(Nd,0.0);
+    const std::vector<Complex> unit_boundary(Nd,Complex(1.0,0.0));
+    FreePropagator(in,out,mass,unit_boundary,zero_twist);
+  };
+  
 protected:

  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);

  // Part frac
+  std::vector<RealD> qmu;
  RealD mass;
  RealD dw_diag;
  RealD R;
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -507,6 +507,7 @@ public:
    }
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
+    accelerator_barrier();
  }

 };
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -48,7 +48,8 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
  mass_plus(_mass), mass_minus(_mass)
-{ 
+{
+  // qmu defaults to zero size;
 }

 ///////////////////////////////////////////////////////////////
@@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  M5Ddag(psi,psi,Din,lower,diag,upper);
 }

+// Adds the momentum term  sum_mu (i q_mu) gamma_mu psi  on top of chi.
+// Nothing is done when qmu is empty; otherwise qmu must hold Nd entries.
+// A non-zero dag complex-conjugates every coefficient i*q_mu.
+template<class Impl>
+void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
+{
+  // an empty qmu switches the term off
+  if ( qmu.size() == 0 ) return;
+
+  assert(qmu.size()==Nd);
+
+  // gamma matrices in x,y,z,t order
+  const Gamma::Algebra directions [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+  const ComplexD imag_unit(0,1);
+
+  // single pass over mu: form the coefficient, then accumulate its term
+  for(int mu=0;mu<Nd;mu++){
+    ComplexD weight = imag_unit*qmu[mu];
+    if ( dag ) weight = conjugate(weight);
+    chi = chi + Gamma(directions[mu])*psi*weight;
+  }
+}
+
 template<class Impl>
 void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
@@ -277,8 +306,12 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  
  // Assemble Din
  Meooe5D(psi,Din);
-  
+
  this->DW(Din,chi,DaggerNo);
+
+  // add i q_mu gamma_mu here
+  addQmu(Din,chi,DaggerNo);
+  
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
  
@@ -295,6 +328,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  FermionField Din(psi.Grid());
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
+
+  // add -i conj(q_mu) gamma_mu here ... if qmu is real, gamma_5 hermitian, otherwise not.
+  addQmu(psi,Din,DaggerYes);
  
  MeooeDag5D(Din,chi);
  
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -42,13 +42,13 @@ template<class Impl>
 void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
 {
  // How to check Ls matches??
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
  int Ls = this->Ls;
+  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
  assert(zdata->db==Ls);// Beta has Ls coeffs

  R=(1+this->mass)/(1-this->mass);
@@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -255,15 +255,76 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  }
 	
  {
+    // The 'conventional' Cayley overlap operator is
+    //
+    // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
+    //
+    //
+    // With massless limit 1/2(1+g5 sgnHw)
+    //
+    // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
+    //
+    // However, the conventional normalisation has both a leading order factor of 2 in Zq
+    // at tree level AND a mass dependent (1-m) that are convenient to absorb.
+    //
+    // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
+    //
+    // num = -i sin kmu gmu
+    //
+    // denom ( sqrt(sk^2 + (2shk^2 - 1)^2
+    //    b_k = sk2 - M5;
+    //     
+    //    w_k = sqrt(sk + b_k*b_k);
+    //
+    //    denom= ( w_k + b_k + mass*mass) ;
+    //
+    //    denom= one/denom;
+    //    out = num*denom;
+    //
+    // Chroma, and Grid define partial fraction via 4d operator
+    //
+    //   Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
+    //
+    // Now since:
+    //
+    //      (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
+    //
+    // This corresponds to a modified mass parameter
+    //
+    // It has an annoying 
+    //
+    // 
    double R=(1+this->mass)/(1-this->mass);
    //R g5 psi[Ls] + p[0] H
    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-	
+    
    for(int b=0;b<nblock;b++){
      int s = 2*b+1;
      double pp = p[nblock-1-b];
      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
    }
+
+    if ( qmu.size() ) {
+      // Add  i g5 qslash psi,  with qslash = sum_mu q_mu gamma_mu,  to slice Ls-1 of chi.
+      FermionField qslash_psi(psi.Grid());
+      
+      Gamma::Algebra Gmu [] = {
+			 Gamma::Algebra::GammaX,
+			 Gamma::Algebra::GammaY,
+			 Gamma::Algebra::GammaZ,
+			 Gamma::Algebra::GammaT
+      };
+      ComplexD ci(0,1);
+      assert(qmu.size()==Nd);
+      qslash_psi = Gamma(Gmu[0])*psi*qmu[0];
+      for(int mu=1;mu<Nd;mu++){
+	qslash_psi = qslash_psi + Gamma(Gmu[mu])*psi*qmu[mu]; // accumulate each direction, weighted by q_mu
+      }
+      // NOTE(review): overall normalisation (a possible 1-m factor) is still an open question.
+      qslash_psi = Gamma(Gamma::Algebra::Gamma5)*qslash_psi*ci ; // i g5 qslash -- 1-m factor???
+      axpby_ssp(chi,1.0,chi,1.0, qslash_psi,Ls-1,Ls-1);
+    }
+    
  }

 }
@@ -411,7 +472,7 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@@ -421,7 +482,8 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      std::cout << " importing to slice " << Ls-1 <<std::endl;
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
@@ -442,7 +504,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

 {
  int Ls = this->Ls;
-
+  qmu.resize(0);
  assert((Ls&0x1)==1); // Odd Ls required
  int nrational=Ls-1;

@@ -460,6 +522,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
  Approx::zolotarev_free(zdata);

 }
+// Constructor variant that also takes a momentum vector _qmu.
+// All other setup is delegated to the constructor without _qmu;
+// afterwards a copy of _qmu is stored in the qmu member.
+template<class Impl>
+PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+							 GridCartesian         &FiveDimGrid,
+							 GridRedBlackCartesian &FiveDimRedBlackGrid,
+							 GridCartesian         &FourDimGrid,
+							 GridRedBlackCartesian &FourDimRedBlackGrid,
+							 RealD _mass,RealD M5,
+							 std::vector<RealD> &_qmu,const ImplParams &p)
+  : PartialFractionFermion5D<Impl>(_Umu,FiveDimGrid,FiveDimRedBlackGrid,
+				   FourDimGrid,FourDimRedBlackGrid,_mass,M5,p)
+{
+  this->qmu = _qmu;
+}

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -332,8 +332,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  /////////////////////////////
  {
    GRID_TRACE("Gather");
-    st.HaloExchangeOptGather(in,compressor);
-    accelerator_barrier();
+    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
  }
  
  std::vector<std::vector<CommsRequest_t> > requests;
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -423,14 +423,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();

 #define KERNEL_CALL_EXT(A)						\
-  const uint64_t    NN = Nsite*Ls;					\
  const uint64_t    sz = st.surface_list.size();			\
  auto ptr = &st.surface_list[0];					\
  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
      int sF = ptr[ss];							\
-      int sU = ss/Ls;							\
+      int sU = sF/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
-    });									
+    });									\
+  accelerator_barrier();

 #define ASM_CALL(A)							\
  thread_for( sss, Nsite, {						\
@@ -474,9 +474,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
+     // dependent on result of merge
     acceleratorFenceComputeStream();
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
@@ -506,9 +507,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
+     // Dependent on result of merge
     acceleratorFenceComputeStream();
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@@ -53,9 +53,10 @@ NAMESPACE_BEGIN(Grid);
      Integer ReliableUpdateFreq;
    protected:

+      //Action evaluation
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
-#if 0
+#if 1
 	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
@@ -70,9 +71,10 @@ NAMESPACE_BEGIN(Grid);
 	msCG(schurOpD, in, out);
 #endif
      }
+      //Force evaluation
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
-	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF);
+	SchurDifferentiableOperator<ImplF>  schurOpF(numerator ? NumOpF  : DenOpF);

 	FermionFieldD inD(NumOpD.FermionRedBlackGrid());
 	FermionFieldD outD(NumOpD.FermionRedBlackGrid());
@@ -84,20 +86,15 @@ NAMESPACE_BEGIN(Grid);
      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){

 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
-	typename ImplD::GaugeField Ud2(NumOpD.GaugeGrid());
 	precisionChange(Uf, Ud);
-	precisionChange(Ud2, Ud);

-	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
+	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl;
 	
 	NumOpD.ImportGauge(Ud);
 	DenOpD.ImportGauge(Ud);

 	NumOpF.ImportGauge(Uf);
 	DenOpF.ImportGauge(Uf);
-
-	NumOpD.ImportGauge(Ud2);
-	DenOpD.ImportGauge(Ud2);
      }
      
    public:
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -207,20 +207,27 @@ NAMESPACE_BEGIN(Grid);
        //X = (Mdag M)^-1 V^dag phi
        //Y = (Mdag)^-1 V^dag  phi
        Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi
+	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
        X=Zero();
        DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi
+	std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl;
        Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi
+	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;

        // phi^dag V (Mdag M)^-1 dV^dag  phi
        Vpc.MpcDagDeriv(force , X, PhiOdd );   dSdU = force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
  
        // phi^dag dV (Mdag M)^-1 V^dag  phi
        Vpc.MpcDeriv(force , PhiOdd, X );      dSdU = dSdU+force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;

        //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
        //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
        Mpc.MpcDeriv(force,Y,X);              dSdU = dSdU-force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
        Mpc.MpcDagDeriv(force,X,Y);           dSdU = dSdU-force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;

        // FIXME No force contribution from EvenEven assumed here
        // Needs a fix for clover.
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -283,12 +283,13 @@ public:
      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;

      TheIntegrator.print_timer();
-
+      
+      TheIntegrator.Smearer.set_Field(Ucur);
      for (int obs = 0; obs < Observables.size(); obs++) {
      	std::cout << GridLogDebug << "Observables # " << obs << std::endl;
      	std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl;
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
-        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
+        Observables[obs]->TrajectoryComplete(traj + 1, TheIntegrator.Smearer, sRNG, pRNG);
      }
      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
--- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
@@ -35,13 +35,16 @@ class CheckpointerParameters : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters, 
 				  std::string, config_prefix, 
+				  std::string, smeared_prefix, 
 				  std::string, rng_prefix, 
 				  int, saveInterval, 
+				  bool, saveSmeared, 
 				  std::string, format, );

-  CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng",
+  CheckpointerParameters(std::string cf = "cfg", std::string sf="cfg_smr" , std::string rn = "rng",
 			 int savemodulo = 1, const std::string &f = "IEEE64BIG")
    : config_prefix(cf),
+      smeared_prefix(sf),
      rng_prefix(rn),
-      saveInterval(savemodulo),
+      saveInterval(savemodulo), saveSmeared(false), // bool would otherwise be left uninitialised by this constructor
      format(f){};
@@ -61,13 +64,21 @@ template <class Impl>
 class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
 public:
  void build_filenames(int traj, CheckpointerParameters &Params,
-                       std::string &conf_file, std::string &rng_file) {
+                       std::string &conf_file,
+                       std::string &smear_file,
+		       std::string &rng_file) {
    {
      std::ostringstream os;
      os << Params.rng_prefix << "." << traj;
      rng_file = os.str();
    }

+    {
+      std::ostringstream os;
+      os << Params.smeared_prefix << "." << traj;
+      smear_file = os.str();
+    }
+
    {
      std::ostringstream os;
      os << Params.config_prefix << "." << traj;
@@ -84,6 +95,11 @@ public:
  }
  virtual void initialize(const CheckpointerParameters &Params) = 0;

+  // Bare-field overload. Reaching it is treated as a logic error: HMC should pass
+  // the smart config (with smeared and unsmeared fields), so this version only asserts.
+  virtual void TrajectoryComplete(int traj, typename Impl::Field &U,
+                                  GridSerialRNG &sRNG, GridParallelRNG &pRNG)
+  { assert(0); }
  virtual void CheckpointRestore(int traj, typename Impl::Field &U,
                                 GridSerialRNG &sRNG,
                                 GridParallelRNG &pRNG) = 0;
--- a/Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h
@@ -61,11 +61,14 @@ public:
    fout.close();
  }

-  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
+  void TrajectoryComplete(int traj,
+			  ConfigurationBase<Field> &SmartConfig,
+			  GridSerialRNG &sRNG, GridParallelRNG &pRNG)
+  {

    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
+      std::string config, rng, smr;
+      this->build_filenames(traj, Params, config, smr, rng);

      uint32_t nersc_csum;
      uint32_t scidac_csuma;
@@ -74,9 +77,15 @@ public:
      BinarySimpleUnmunger<sobj_double, sobj> munge;
      truncate(rng);
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      truncate(config);
+      std::cout << GridLogMessage << "Written Binary RNG " << rng
+                << " checksum " << std::hex 
+		<< nersc_csum   <<"/"
+		<< scidac_csuma   <<"/"
+		<< scidac_csumb 
+		<< std::dec << std::endl;

-      BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
+      truncate(config);
+      BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(false), config, munge, 0, Params.format,
 						      nersc_csum,scidac_csuma,scidac_csumb);

      std::cout << GridLogMessage << "Written Binary Configuration " << config
@@ -85,6 +94,18 @@ public:
 		<< scidac_csuma   <<"/"
 		<< scidac_csumb 
 		<< std::dec << std::endl;
+
+      if ( Params.saveSmeared ) {
+	truncate(smr);
+	BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(true), smr, munge, 0, Params.format,
+							nersc_csum,scidac_csuma,scidac_csumb);
+	std::cout << GridLogMessage << "Written Binary Smeared Configuration " << smr
+                << " checksum " << std::hex 
+		<< nersc_csum   <<"/"
+		<< scidac_csuma   <<"/"
+		<< scidac_csumb 
+		<< std::dec << std::endl;
+      }
    }

  };
--- a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -69,17 +69,27 @@ public:
    }
  }

-  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
+  void TrajectoryComplete(int traj,
+			  ConfigurationBase<GaugeField> &SmartConfig,
+			  GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
+      std::string config, rng, smr;
-      this->build_filenames(traj, Params, config, rng);
+      this->build_filenames(traj, Params, config, smr, rng); // five-argument form: also fills smr, the smeared output file name
-      GridBase *grid = U.Grid();
+      GridBase *grid = SmartConfig.get_U(false).Grid();
      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
+      std::cout << GridLogMessage << "Written BINARY RNG " << rng
+                << " checksum " << std::hex 
+		<< nersc_csum<<"/"
+		<< scidac_csuma<<"/"
+		<< scidac_csumb
+		<< std::dec << std::endl;
+
+      
      IldgWriter _IldgWriter(grid->IsBoss());
      _IldgWriter.open(config);
-      _IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
+      _IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(false), traj, config, config);
      _IldgWriter.close();

      std::cout << GridLogMessage << "Written ILDG Configuration on " << config
@@ -88,6 +98,21 @@ public:
 		<< scidac_csuma<<"/"
 		<< scidac_csumb
 		<< std::dec << std::endl;
+
+      if ( Params.saveSmeared ) { 
+	IldgWriter _IldgWriter(grid->IsBoss());
+	_IldgWriter.open(smr);
+	// tag the smeared record with its own file name (smr) rather than the unsmeared one
+	_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
+	_IldgWriter.close();
+	std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
+                << " checksum " << std::hex 
+		<< nersc_csum<<"/"
+		<< scidac_csuma<<"/"
+		<< scidac_csumb
+		<< std::dec << std::endl;
+      }
+
    }
  };

--- a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
@@ -52,23 +52,29 @@ public:
    Params.format = "IEEE64BIG";  // fixed, overwrite any other choice
  }

-  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
-                          GridParallelRNG &pRNG) {
+  virtual void TrajectoryComplete(int traj,
+                                  ConfigurationBase<GaugeField> &SmartConfig,
+                                  GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG)
+  {
    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
-
+      std::string config, rng, smr;
+      this->build_filenames(traj, Params, config, smr, rng);
+      
      int precision32 = 1;
      int tworow = 0;
      NerscIO::writeRNGState(sRNG, pRNG, rng);
-      NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
+      NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(false), config, tworow, precision32);
+      if ( Params.saveSmeared ) {
+	NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(true), smr, tworow, precision32);
+      }
    }
  };

  void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
                         GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
+    std::string config, rng, smr;
+    this->build_filenames(traj, Params, config, smr, rng );
    this->check_filename(rng);
    this->check_filename(config);

--- a/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
@@ -70,19 +70,37 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
    }
  }

-  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
+  void TrajectoryComplete(int traj, 
+			  ConfigurationBase<Field> &SmartConfig,
+			  GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
-      GridBase *grid = U.Grid();
+      std::string config, rng,smr;
+      this->build_filenames(traj, Params, config, smr, rng);
+      GridBase *grid = SmartConfig.get_U(false).Grid();
      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      ScidacWriter _ScidacWriter(grid->IsBoss());
-      _ScidacWriter.open(config);
-      _ScidacWriter.writeScidacFieldRecord(U, MData);
-      _ScidacWriter.close();
+      std::cout << GridLogMessage << "Written Binary RNG " << rng
+                << " checksum " << std::hex 
+		<< nersc_csum   <<"/"
+		<< scidac_csuma   <<"/"
+		<< scidac_csumb 
+		<< std::dec << std::endl;

+
+      {
+	ScidacWriter _ScidacWriter(grid->IsBoss());
+	_ScidacWriter.open(config);
+	_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(false), MData);
+	_ScidacWriter.close();
+      }
+      
+      if ( Params.saveSmeared ) {
+	ScidacWriter _ScidacWriter(grid->IsBoss());
+	_ScidacWriter.open(smr);
+	_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(true), MData);
+	_ScidacWriter.close();
+      }
      std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl;
    }
  };
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -66,6 +66,7 @@ public:
 template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy>
 class Integrator {
 protected:
+public:
  typedef FieldImplementation_ FieldImplementation;
  typedef typename FieldImplementation::Field MomentaField;  //for readability
  typedef typename FieldImplementation::Field Field;
@@ -96,7 +97,6 @@ protected:
  {
    t_P[level] += ep;
    update_P(P, U, level, ep);
-
    std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
  }

@@ -130,28 +130,20 @@ protected:
      Field force(U.Grid());
      conformable(U.Grid(), Mom.Grid());

-      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();

-      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
-      
      as[level].actions.at(a)->deriv_timer_start();
-      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
+      as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta
      as[level].actions.at(a)->deriv_timer_stop();

-      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
-
-      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      auto name = as[level].actions.at(a)->action_name();
-      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);

      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-
-      //      DumpSliceNorm("force ",force,Nd-1);
+      
      MomFilter->applyFilter(force);
+
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl;
-      DumpSliceNorm("force filtered ",force,Nd-1);
      
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
@@ -377,14 +369,9 @@ public:
 	auto name = as[level].actions.at(actionID)->action_name();
        std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;

-        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
-
-	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
-
 	as[level].actions.at(actionID)->refresh_timer_start();
-        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
+        as[level].actions.at(actionID)->refresh(Smearer, sRNG, pRNG);
 	as[level].actions.at(actionID)->refresh_timer_stop();
-	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;

      }

@@ -425,10 +412,9 @@ public:

        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
-        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 	        as[level].actions.at(actionID)->S_timer_start();
-        Hterm = as[level].actions.at(actionID)->S(Us);
+        Hterm = as[level].actions.at(actionID)->S(Smearer);
   	        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
@@ -469,12 +455,11 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
-        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
-	        as[level].actions.at(actionID)->S_timer_start();

-        Hterm = as[level].actions.at(actionID)->Sinitial(Us);
-   	        as[level].actions.at(actionID)->S_timer_stop();
+	as[level].actions.at(actionID)->S_timer_start();
+        Hterm = as[level].actions.at(actionID)->S(Smearer);
+	as[level].actions.at(actionID)->S_timer_stop();

        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
--- a/Grid/qcd/observables/hmc_observable.h
+++ b/Grid/qcd/observables/hmc_observable.h
@@ -34,6 +34,13 @@ NAMESPACE_BEGIN(Grid);
 template <class Field>
 class HmcObservable {
 public:
+  virtual void TrajectoryComplete(int traj, // Container-based entry point; this default forwards to the Field overload
+                                  ConfigurationBase<Field> &SmartConfig, // supplies the gauge field through get_U()
+                                  GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG)
+  {
+    TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // get_U(false): measure on the unsmeared field
+  };
  virtual void TrajectoryComplete(int traj,
                                  Field &U,
                                  GridSerialRNG &sRNG,
--- a/Grid/qcd/observables/plaquette.h
+++ b/Grid/qcd/observables/plaquette.h
@@ -42,6 +42,18 @@ public:
  // necessary for HmcObservable compatibility
  typedef typename Impl::Field Field;

+  virtual void TrajectoryComplete(int traj, // Container-based entry point: runs the Field overload twice, unsmeared then smeared
+                                  ConfigurationBase<Field> &SmartConfig,
+                                  GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG)
+  {
+    std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
+    std::cout << GridLogMessage << "Unsmeared plaquette"<<std::endl;
+    TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
+    std::cout << GridLogMessage << "Smeared plaquette"<<std::endl;
+    TrajectoryComplete(traj,SmartConfig.get_U(true),sRNG,pRNG); // Smeared observable
+    std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
+  };
  void TrajectoryComplete(int traj,
                          Field &U,
                          GridSerialRNG &sRNG,
--- a/Grid/qcd/smearing/GaugeConfiguration.h
+++ b/Grid/qcd/smearing/GaugeConfiguration.h
@@ -7,26 +7,27 @@

 NAMESPACE_BEGIN(Grid);

+
 //trivial class for no smearing
 template< class Impl >
-class NoSmearing
+class NoSmearing : public ConfigurationBase<typename Impl::Field>
 {
 public:
  INHERIT_FIELD_TYPES(Impl);

-  Field* ThinField;
+  Field* ThinLinks;

-  NoSmearing(): ThinField(NULL) {}
+  NoSmearing(): ThinLinks(NULL) {}

-  void set_Field(Field& U) { ThinField = &U; }
+  virtual void set_Field(Field& U) { ThinLinks = &U; }

-  void smeared_force(Field&) const {}
+  virtual void smeared_force(Field&) {}

-  Field& get_SmearedU() { return *ThinField; }
+  virtual Field& get_SmearedU() { return *ThinLinks; }

-  Field &get_U(bool smeared = false)
+  virtual Field &get_U(bool smeared = false)
  {
-    return *ThinField;
+    return *ThinLinks;
  }
 };

@@ -42,19 +43,24 @@ public:
  It stores a list of smeared configurations.
 */
 template <class Gimpl>
-class SmearedConfiguration
+class SmearedConfiguration : public ConfigurationBase<typename Gimpl::Field>
 {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);

-private:
+protected:
  const unsigned int smearingLevels;
  Smear_Stout<Gimpl> *StoutSmearing;
  std::vector<GaugeField> SmearedSet;
-
+public:
+  GaugeField*  ThinLinks; /* Pointer to the thin links configuration */ // move to base???
+protected:
+  
  // Member functions
  //====================================================================
-  void fill_smearedSet(GaugeField &U)
+
+  // Overridden in masked version
+  virtual void fill_smearedSet(GaugeField &U)
  {
    ThinLinks = &U;  // attach the smearing routine to the field U

@@ -82,9 +88,10 @@ private:
      }
    }
  }
-  //====================================================================
-  GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
-                                  const GaugeField& GaugeK) const 
+
+  //overridden in masked version
+  virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
+					  const GaugeField& GaugeK) const 
  {
    GridBase* grid = GaugeK.Grid();
    GaugeField C(grid), SigmaK(grid), iLambda(grid);
@@ -213,8 +220,6 @@ private:

  //====================================================================
 public:
-  GaugeField*
-      ThinLinks; /* Pointer to the thin links configuration */

  /* Standard constructor */
  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
@@ -230,7 +235,7 @@ public:
    : smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}

  // attach the smeared routines to the thin links U and fill the smeared set
-  void set_Field(GaugeField &U)
+  virtual void set_Field(GaugeField &U)
  {
    double start = usecond();
    fill_smearedSet(U);
@@ -240,7 +245,7 @@ public:
  }

  //====================================================================
-  void smeared_force(GaugeField &SigmaTilde) const
+  virtual void smeared_force(GaugeField &SigmaTilde) 
  {
    if (smearingLevels > 0)
    {
@@ -267,14 +272,16 @@ public:
      }
      double end = usecond();
      double time = (end - start)/ 1e3;
-      std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;  
+      std::cout << GridLogMessage << " GaugeConfiguration: Smeared Force chain rule took " << time << " ms" << std::endl;
    }  // if smearingLevels = 0 do nothing
+    SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
+      
  }
  //====================================================================

-  GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
+  virtual GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }

-  GaugeField &get_U(bool smeared = false)
+  virtual GaugeField &get_U(bool smeared = false)
  {
    // get the config, thin links by default
    if (smeared)
--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@@ -0,0 +1,813 @@
+/*!
+  @file GaugeConfigurationMasked.h
+  @brief Declares the SmearedConfigurationMasked class
+*/
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+/*!
+  @brief Smeared configuration masked container
+  Modified for a multi-subset smearing (aka Luscher Flowed HMC)
+*/
+template <class Gimpl>
+class SmearedConfigurationMasked : public SmearedConfiguration<Gimpl>
+{
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+private:
+  // These live in base class
+  //  const unsigned int smearingLevels;
+  //  Smear_Stout<Gimpl> *StoutSmearing;
+  //  std::vector<GaugeField> SmearedSet;
+  
+  std::vector<LatticeLorentzComplex> masks;
+
+  typedef typename SU3Adjoint::AMatrix AdjMatrix;
+  typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
+  typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField;
+
+  // Adjoint vector to GaugeField force
+  void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu) // writes sum_e i*Fdet_nu[e]*t_e into Lorentz component nu of Fdet
+  {
+    Complex ci(0,1);
+    GaugeLinkField Fdet_pol(Fdet.Grid());
+    Fdet_pol=Zero();
+    for(int e=0;e<8;e++){ // loop over the 8 generator indices
+      ColourMatrix te;
+      SU3::generator(e, te);
+      auto tmp=peekColour(Fdet_nu,e); // e-th component of the adjoint vector
+      Fdet_pol=Fdet_pol + ci*tmp*te; // but norm of te is different.. why?
+    }
+    pokeLorentz(Fdet, Fdet_pol, nu); // only component nu of Fdet is modified
+  }
+  void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, const AdjMatrixField &MpInvJx,AdjVectorField &Fdet2 )
+  { // For each generator a: builds Dbc from adj(PlaqL)*ta*PlaqR and stores trace(MpInvJx*Dbc) in component a of Fdet2
+    GaugeLinkField UtaU(PlaqL.Grid());
+    GaugeLinkField D(PlaqL.Grid());
+    AdjMatrixField Dbc(PlaqL.Grid());
+    LatticeComplex tmp(PlaqL.Grid());
+    const int Ngen = SU3Adjoint::Dimension;
+    Complex ci(0,1);
+    ColourMatrix   ta,tb,tc;
+    // MpInvJx is only read, so it is taken by const reference to avoid copying a full lattice field per call
+    for(int a=0;a<Ngen;a++) {
+      SU3::generator(a, ta);
+      // Qlat Tb = 2i Tb^Grid
+      UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
+      for(int c=0;c<Ngen;c++) {
+	SU3::generator(c, tc);
+	D = Ta( (2.0)*ci*tc *UtaU);
+	for(int b=0;b<Ngen;b++){
+	  SU3::generator(b, tb);
+	  tmp =-trace(ci*tb*D); 
+	  PokeIndex<ColourIndex>(Dbc,tmp,b,c);  // Adjoint rep
+	}
+      }
+      tmp = trace(MpInvJx * Dbc);
+      PokeIndex<ColourIndex>(Fdet2,tmp,a); // component a of the output adjoint vector
+    }
+  }
+  
+  void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd) // fills the Ngen x Ngen adjoint-index matrix NxAd from PlaqL, PlaqR
+  {
+    GaugeLinkField Nx(PlaqL.Grid());
+    const int Ngen = SU3Adjoint::Dimension;
+    Complex ci(0,1);
+    ColourMatrix   tb;
+    ColourMatrix   tc;
+    for(int b=0;b<Ngen;b++) {
+      SU3::generator(b, tb);
+      Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR ); // generator tb inserted between the two plaquette pieces
+      for(int c=0;c<Ngen;c++) {
+	SU3::generator(c, tc);
+	auto tmp =closure( -trace(ci*tc*Nx)); // project Nx onto generator tc
+	PokeIndex<ColourIndex>(NxAd,tmp,c,b); // stored at row c, column b
+      }
+    }
+  }
+  void ApplyMask(GaugeField &U,int smr) // multiplies each Lorentz component of U, in place, by the matching component of masks[smr]
+  {
+    LatticeComplex tmp(U.Grid());
+    GaugeLinkField Umu(U.Grid());
+    for(int mu=0;mu<Nd;mu++){
+      Umu=PeekIndex<LorentzIndex>(U,mu);
+      tmp=PeekIndex<LorentzIndex>(masks[smr],mu); // mask for this direction at level smr
+      Umu=Umu*tmp;
+      PokeIndex<LorentzIndex>(U, Umu, mu);
+    }
+  }
+public:
+
+  void logDetJacobianForceLevel(const GaugeField &U, GaugeField &force ,int smr) // sets force = -0.5*(Fdet1+Fdet2) for smearing level smr acting on U
+  {
+    GridBase* grid = U.Grid();
+    ColourMatrix   tb;
+    ColourMatrix   tc;
+    ColourMatrix   ta;
+    GaugeField C(grid);
+    GaugeField Umsk(grid);
+    std::vector<GaugeLinkField> Umu(Nd,grid);
+    GaugeLinkField Cmu(grid); // U and staple; C contains factor of epsilon
+    GaugeLinkField Zx(grid);  // U times Staple, contains factor of epsilon
+    GaugeLinkField Nxx(grid);  // Nxx fundamental space
+    GaugeLinkField Utmp(grid);
+    GaugeLinkField PlaqL(grid);
+    GaugeLinkField PlaqR(grid);
+    const int Ngen = SU3Adjoint::Dimension;
+    AdjMatrix TRb;
+    ColourMatrix Ident;
+    LatticeComplex  cplx(grid);
+    
+    AdjVectorField  dJdXe_nMpInv(grid); 
+    AdjVectorField  dJdXe_nMpInv_y(grid); 
+    AdjMatrixField  MpAd(grid);    // Mprime luchang's notes
+    AdjMatrixField  MpAdInv(grid); // Mprime inverse
+    AdjMatrixField  NxxAd(grid);    // Nxx in adjoint space
+    AdjMatrixField  JxAd(grid);     
+    AdjMatrixField  ZxAd(grid);
+    AdjMatrixField  mZxAd(grid);
+    AdjMatrixField  X(grid);
+    Complex ci(0,1);
+
+    RealD t0 = usecond();
+    Ident = ComplexD(1.0);
+    for(int d=0;d<Nd;d++){
+      Umu[d] = peekLorentz(U, d);
+    }
+    int mu= (smr/2) %Nd; // link direction selected for this level
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Mask the gauge field
+    ////////////////////////////////////////////////////////////////////////////////
+    auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask (not referenced again in this function; re-peeked as tmp below)
+
+    Umsk = U;
+    ApplyMask(Umsk,smr);
+    Utmp = peekLorentz(Umsk,mu);
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Retrieve the eps/rho parameter(s) -- could allow all different but not so far
+    ////////////////////////////////////////////////////////////////////////////////
+    double rho=this->StoutSmearing->SmearRho[1]; // the asserts below require every off-diagonal entry to equal this and every diagonal entry to be zero
+    int idx=0;
+    for(int mu=0;mu<4;mu++){
+    for(int nu=0;nu<4;nu++){
+      if ( mu!=nu) assert(this->StoutSmearing->SmearRho[idx]==rho);
+      else         assert(this->StoutSmearing->SmearRho[idx]==0.0);
+      idx++;
+    }}
+    //////////////////////////////////////////////////////////////////
+    // Assemble the N matrix
+    //////////////////////////////////////////////////////////////////
+    // Computes ALL the staples -- could compute one only and do it here
+    RealD time;
+    time=-usecond();
+    this->StoutSmearing->BaseSmear(C, U);
+    Cmu = peekLorentz(C, mu);
+
+    //////////////////////////////////////////////////////////////////
+    // Assemble Luscher exp diff map J matrix 
+    //////////////////////////////////////////////////////////////////
+    // Ta so Z lives in Lie algebra
+    Zx  = Ta(Cmu * adj(Umu[mu]));
+    time+=usecond();
+    std::cout << GridLogMessage << "Z took "<<time<< " us"<<std::endl;
+
+    time=-usecond();
+    // Move Z to the Adjoint Rep == make_adjoint_representation
+    ZxAd = Zero();
+    for(int b=0;b<8;b++) {
+      // Adj group sets traceless antihermitian T's -- Guido, really????
+      SU3::generator(b, tb);         // Fund group sets traceless hermitian T's
+      SU3Adjoint::generator(b,TRb);
+      TRb=-TRb;
+      cplx = 2.0*trace(ci*tb*Zx); // my convention 1/2 delta ba
+      ZxAd = ZxAd + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
+    }
+    time+=usecond();
+    std::cout << GridLogMessage << "ZxAd took "<<time<< " us"<<std::endl;
+
+    //////////////////////////////////////
+    // J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
+    //////////////////////////////////////
+    time=-usecond();
+    X=1.0; 
+    JxAd = X;
+    mZxAd = (-1.0)*ZxAd; 
+    RealD kpfac = 1;
+    for(int k=1;k<12;k++){ // series truncated after the k=11 term
+      X=X*mZxAd;
+      kpfac = kpfac /(k+1);
+      JxAd = JxAd + X * kpfac;
+    }
+    time+=usecond();
+    std::cout << GridLogMessage << "Jx took "<<time<< " us"<<std::endl;
+
+    //////////////////////////////////////
+    // dJ(x)/dxe
+    //////////////////////////////////////
+    time=-usecond();
+    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
+    AdjMatrixField tbXn(grid);
+    AdjMatrixField sumXtbX(grid);
+    AdjMatrixField t2(grid);
+    AdjMatrixField dt2(grid);
+    AdjMatrixField t3(grid);
+    AdjMatrixField dt3(grid);
+    AdjMatrixField aunit(grid);
+    for(int b=0;b<8;b++){
+      aunit = ComplexD(1.0);
+      SU3Adjoint::generator(b, TRb); //dt2
+
+      X  = (-1.0)*ZxAd; 
+      t2 = X;
+      dt2 = TRb;
+      for (int j = 20; j > 1; --j) { // nested evaluation carrying the value (t2) and its derivative (dt2) together
+	t3 = t2*(1.0 / (j + 1))  + aunit;
+	dt3 = dt2*(1.0 / (j + 1));
+	t2 = X * t3;
+	dt2 = TRb * t3 + X * dt3;
+      }
+      dJdX[b] = -dt2; 
+    }
+    time+=usecond();
+    std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
+    /////////////////////////////////////////////////////////////////
+    // Mask Umu for this link
+    /////////////////////////////////////////////////////////////////
+    time=-usecond();
+    PlaqL = Ident;
+    PlaqR = Utmp*adj(Cmu);
+    ComputeNxy(PlaqL,PlaqR,NxxAd);
+    time+=usecond();
+    std::cout << GridLogMessage << "ComputeNxy took "<<time<< " us"<<std::endl;
+    
+    ////////////////////////////
+    // Mab
+    ////////////////////////////
+    MpAd = Complex(1.0,0.0);
+    MpAd = MpAd - JxAd * NxxAd;
+
+    /////////////////////////
+    // invert the 8x8
+    /////////////////////////
+    time=-usecond();
+    MpAdInv = Inverse(MpAd);
+    time+=usecond();
+    std::cout << GridLogMessage << "MpAdInv took "<<time<< " us"<<std::endl;
+    
+    RealD t3a = usecond();
+    /////////////////////////////////////////////////////////////////
+    // Nxx Mp^-1
+    /////////////////////////////////////////////////////////////////
+    AdjVectorField  FdetV(grid);
+    AdjVectorField  Fdet1_nu(grid);
+    AdjVectorField  Fdet2_nu(grid);
+    AdjVectorField  Fdet2_mu(grid);
+    AdjVectorField  Fdet1_mu(grid);
+
+    AdjMatrixField nMpInv(grid);
+    nMpInv= NxxAd *MpAdInv;
+
+    AdjMatrixField MpInvJx(grid);
+    AdjMatrixField MpInvJx_nu(grid);
+    MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
+
+    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
+    Fdet2_mu=FdetV;
+    Fdet1_mu=Zero();
+    
+    for(int e =0 ; e<8 ; e++){
+      LatticeComplexD tr(grid);
+      ColourMatrix te;
+      SU3::generator(e, te);
+      tr = trace(dJdX[e] * nMpInv);
+      pokeColour(dJdXe_nMpInv,tr,e);
+    }
+    ///////////////////////////////
+    // Mask it off
+    ///////////////////////////////
+    auto tmp=PeekIndex<LorentzIndex>(masks[smr],mu);
+    dJdXe_nMpInv = dJdXe_nMpInv*tmp;
+    
+    //    dJdXe_nMpInv needs to multiply:
+    //       Nxx_mu (site local)                           (1)
+    //       Nxy_mu one site forward  in each nu direction (3)
+    //       Nxy_mu one site backward in each nu direction (3)
+    //       Nxy_nu 0,0  ; +mu,0; 0,-nu; +mu-nu   [ 3x4 = 12]
+    // 19 terms.
+    AdjMatrixField Nxy(grid);
+
+    GaugeField Fdet1(grid);
+    GaugeField Fdet2(grid);
+    GaugeLinkField Fdet_pol(grid); // one polarisation
+
+    RealD t4 = usecond();
+    for(int nu=0;nu<Nd;nu++){
+
+      if (nu!=mu) {
+	///////////////// +ve nu /////////////////
+	//     __
+	//    |  |
+	//    x==    // nu polarisation -- clockwise
+
+	time=-usecond();
+	PlaqL=Ident;
+
+	PlaqR=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
+ 	       Gimpl::CovShiftForward(Umu[mu], mu,
+	         Gimpl::CovShiftBackward(Umu[nu], nu,
+		   Gimpl::CovShiftIdentityBackward(Utmp, mu))));
+	time+=usecond();
+	std::cout << GridLogMessage << "PlaqLR took "<<time<< " us"<<std::endl;
+
+	time=-usecond();
+	dJdXe_nMpInv_y =   dJdXe_nMpInv;
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_nu = transpose(Nxy)*dJdXe_nMpInv_y;
+	time+=usecond();
+	std::cout << GridLogMessage << "ComputeNxy (occurs 6x) took "<<time<< " us"<<std::endl;
+
+	time=-usecond();
+	PlaqR=(-1.0)*PlaqR;
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
+	Fdet2_nu = FdetV;
+	time+=usecond();
+	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
+	
+	//    x==
+	//    |  |
+	//    .__|    // nu polarisation -- anticlockwise
+
+	PlaqR=(rho)*Gimpl::CovShiftForward(Umu[nu], nu,
+		      Gimpl::CovShiftBackward(Umu[mu], mu,
+    	 	        Gimpl::CovShiftIdentityBackward(Umu[nu], nu)));
+
+	PlaqL=Gimpl::CovShiftIdentityBackward(Utmp, mu);
+
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
+	ComputeNxy(PlaqL, PlaqR,Nxy);
+	Fdet1_nu = Fdet1_nu+transpose(Nxy)*dJdXe_nMpInv_y;
+	
+
+	MpInvJx_nu = Cshift(MpInvJx,mu,-1);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_nu = Fdet2_nu+FdetV;
+	
+	///////////////// -ve nu /////////////////
+	//  __
+	// |  |
+	// x==          // nu polarisation -- clockwise
+
+	PlaqL=(rho)* Gimpl::CovShiftForward(Umu[mu], mu,
+		       Gimpl::CovShiftForward(Umu[nu], nu,
+			 Gimpl::CovShiftIdentityBackward(Utmp, mu)));
+
+        PlaqR = Gimpl::CovShiftIdentityForward(Umu[nu], nu);
+
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
+
+	MpInvJx_nu = Cshift(MpInvJx,nu,1);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_nu = Fdet2_nu+FdetV;
+	
+	// x==
+	// |  |
+	// |__|         // nu polarisation
+
+	PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
+ 	        Gimpl::CovShiftIdentityBackward(Utmp, mu));
+
+	PlaqR=Gimpl::CovShiftBackward(Umu[mu], mu,
+	        Gimpl::CovShiftIdentityForward(Umu[nu], nu));
+
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv_y,nu,1);
+
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
+
+	MpInvJx_nu = Cshift(MpInvJx,mu,-1);
+	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_nu = Fdet2_nu+FdetV;
+
+	/////////////////////////////////////////////////////////////////////
+	// Set up the determinant force contribution in 3x3 algebra basis
+	/////////////////////////////////////////////////////////////////////
+	InsertForce(Fdet1,Fdet1_nu,nu);
+	InsertForce(Fdet2,Fdet2_nu,nu);
+	
+	//////////////////////////////////////////////////
+	// Parallel direction terms
+	//////////////////////////////////////////////////
+
+        //     __
+	//    |  "
+	//    |__"x    // mu polarisation
+	PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
+		      Gimpl::CovShiftBackward(Umu[nu], nu,
+   		        Gimpl::CovShiftIdentityBackward(Utmp, mu)));
+
+	PlaqR=Gimpl::CovShiftIdentityBackward(Umu[nu], nu);
+	
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,-1);
+
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
+
+	MpInvJx_nu = Cshift(MpInvJx,nu,-1);
+
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_mu = Fdet2_mu+FdetV;
+
+	//  __
+	// "  |
+	// x__|          // mu polarisation
+
+	PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
+		       Gimpl::CovShiftForward(Umu[nu], nu,
+		 	 Gimpl::CovShiftIdentityBackward(Utmp, mu)));
+
+        PlaqR=Gimpl::CovShiftIdentityForward(Umu[nu], nu);
+
+	dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
+
+	ComputeNxy(PlaqL,PlaqR,Nxy);
+	Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
+
+	MpInvJx_nu = Cshift(MpInvJx,nu,1);
+
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Fdet2_mu = Fdet2_mu+FdetV;
+	
+      }
+    }
+    RealD t5 = usecond();
+
+    Fdet1_mu = Fdet1_mu + transpose(NxxAd)*dJdXe_nMpInv; // site-local Nxx term
+
+    InsertForce(Fdet1,Fdet1_mu,mu);
+    InsertForce(Fdet2,Fdet2_mu,mu);
+
+    force= (-0.5)*( Fdet1 + Fdet2);
+    RealD t1 = usecond();
+    std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce t3-t0 "<<t3a-t0<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce t4-t3 dJdXe_nMpInv "<<t4-t3a<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce t5-t4 mu nu loop "<<t5-t4<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce t1-t5 "<<t1-t5<<" us "<<std::endl;
+    std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl; // NOTE(review): repeats the first timing line above
+  }
+  RealD logDetJacobianLevel(const GaugeField &U,int smr) // returns Re of the masked lattice sum of log det(1 - J*N) for level smr
+  {
+    GridBase* grid = U.Grid();
+    GaugeField C(grid);
+    GaugeLinkField Nb(grid);
+    GaugeLinkField Z(grid);
+    GaugeLinkField Umu(grid), Cmu(grid);
+    ColourMatrix   Tb;
+    ColourMatrix   Tc;
+    typedef typename SU3Adjoint::AMatrix AdjMatrix; // these three local typedefs repeat the class-level ones
+    typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
+    typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField;
+    const int Ngen = SU3Adjoint::Dimension;
+    AdjMatrix TRb;
+    LatticeComplex       cplx(grid); 
+    AdjVectorField  AlgV(grid); 
+    AdjMatrixField  Mab(grid);
+    AdjMatrixField  Ncb(grid);
+    AdjMatrixField  Jac(grid);
+    AdjMatrixField  Zac(grid);
+    AdjMatrixField  mZac(grid);
+    AdjMatrixField  X(grid);
+
+    int mu= (smr/2) %Nd; // link direction selected for this level
+
+    auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
+
+    //////////////////////////////////////////////////////////////////
+    // Assemble the N matrix
+    //////////////////////////////////////////////////////////////////
+    // Computes ALL the staples -- could compute one only here
+    this->StoutSmearing->BaseSmear(C, U);
+    Cmu = peekLorentz(C, mu);
+    Umu = peekLorentz(U, mu);
+    Complex ci(0,1);
+    for(int b=0;b<Ngen;b++) {
+      SU3::generator(b, Tb);
+      // Qlat Tb = 2i Tb^Grid
+      Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
+      for(int c=0;c<Ngen;c++) {
+	SU3::generator(c, Tc);
+	auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
+	PokeIndex<ColourIndex>(Ncb,tmp,c,b); 
+      }
+    }      
+
+    //////////////////////////////////////////////////////////////////
+    // Assemble Luscher exp diff map J matrix 
+    //////////////////////////////////////////////////////////////////
+    // Ta so Z lives in Lie algebra
+    Z  = Ta(Cmu * adj(Umu));
+
+    // Move Z to the Adjoint Rep == make_adjoint_representation
+    Zac = Zero();
+    for(int b=0;b<8;b++) {
+      // Adj group sets traceless antihermitian T's -- Guido, really????
+      // Is the mapping of these the same? Same structure constants
+      // Might never have been checked.
+      SU3::generator(b, Tb);         // Fund group sets traceless hermitian T's
+      SU3Adjoint::generator(b,TRb);
+      TRb=-TRb;
+      cplx = 2.0*trace(ci*Tb*Z); // my convention 1/2 delta ba
+      Zac = Zac + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
+    }
+
+    //////////////////////////////////////
+    // J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
+    //////////////////////////////////////
+    X=1.0; 
+    Jac = X;
+    mZac = (-1.0)*Zac; 
+    RealD kpfac = 1;
+    for(int k=1;k<12;k++){ // series truncated after the k=11 term
+      X=X*mZac;
+      kpfac = kpfac /(k+1);
+      Jac = Jac + X * kpfac;
+    }
+
+    ////////////////////////////
+    // Mab
+    ////////////////////////////
+    Mab = Complex(1.0,0.0);
+    Mab = Mab - Jac * Ncb;
+
+    ////////////////////////////
+    // det
+    ////////////////////////////
+    LatticeComplex       det(grid); 
+    det = Determinant(Mab);
+
+    ////////////////////////////
+    // ln det
+    ////////////////////////////
+    LatticeComplex       ln_det(grid); 
+    ln_det = log(det);
+
+    ////////////////////////////
+    // Masked sum
+    ////////////////////////////
+    ln_det = ln_det * mask; // the log is taken at every site; the mask is applied only before the sum
+    Complex result = sum(ln_det);
+    return result.real(); // imaginary part of the summed log is discarded
+  }
+public:
+  RealD logDetJacobian(void) // sum of logDetJacobianLevel over all smearing levels; returns 0 when smearingLevels == 0
+  {
+    RealD ln_det = 0;
+    if (this->smearingLevels > 0)
+    {
+      double start = usecond();
+      for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
+	ln_det+= logDetJacobianLevel(this->get_smeared_conf(ismr-1),ismr); // level ismr is evaluated on the previous level's field
+      }
+      ln_det +=logDetJacobianLevel(*(this->ThinLinks),0); // level 0 is evaluated on the thin links
+
+      double end = usecond();
+      double time = (end - start)/ 1e3;
+      std::cout << GridLogMessage << "GaugeConfigurationMasked: logDetJacobian took " << time << " ms" << std::endl;  
+    }
+    return ln_det;
+  }
+  void logDetJacobianForce(GaugeField &force) // accumulates each level's determinant force, propagating it down to the thin links; force is zero if smearingLevels == 0
+  {
+    force =Zero();
+    GaugeField force_det(force.Grid());
+
+    if (this->smearingLevels > 0)
+    {
+      double start = usecond();
+
+      GaugeLinkField tmp_mu(force.Grid());
+
+      for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
+
+	// remove U in UdSdU...
+	for (int mu = 0; mu < Nd; mu++) {
+	  tmp_mu = adj(peekLorentz(this->get_smeared_conf(ismr), mu)) * peekLorentz(force, mu);
+	  pokeLorentz(force, tmp_mu, mu);
+	}
+	
+      	// Propagate existing force
+        force = this->AnalyticSmearedForce(force, this->get_smeared_conf(ismr - 1), ismr);
+
+	// Add back U in UdSdU...
+	for (int mu = 0; mu < Nd; mu++) {
+	  tmp_mu = peekLorentz(this->get_smeared_conf(ismr - 1), mu) * peekLorentz(force, mu);
+	  pokeLorentz(force, tmp_mu, mu);
+	}
+    	
+	// Get this level's determinant force
+	force_det = Zero();
+	logDetJacobianForceLevel(this->get_smeared_conf(ismr-1),force_det,ismr);
+
+	// Sum the contributions
+	force = force + force_det;
+      }
+    
+      // remove U in UdSdU...
+      for (int mu = 0; mu < Nd; mu++) {
+	tmp_mu = adj(peekLorentz(this->get_smeared_conf(0), mu)) * peekLorentz(force, mu);
+	pokeLorentz(force, tmp_mu, mu);
+      }
+
+      force = this->AnalyticSmearedForce(force, *this->ThinLinks,0); // final step: level 0 acts on the thin links
+
+      for (int mu = 0; mu < Nd; mu++) {
+	tmp_mu = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
+	pokeLorentz(force, tmp_mu, mu);
+      }
+
+      force_det = Zero();
+
+      logDetJacobianForceLevel(*this->ThinLinks,force_det,0);
+
+      force = force + force_det;
+
+      force=Ta(force); // Ta
+      
+      double end = usecond();
+      double time = (end - start)/ 1e3;
+      std::cout << GridLogMessage << "GaugeConfigurationMasked: lnDetJacobianForce took " << time << " ms" << std::endl;  
+    }  // if smearingLevels = 0 do nothing
+  }
+
+private:
+  //====================================================================
+  // Override base class here to mask it: each level replaces only the masked links with their smeared values
+  virtual void fill_smearedSet(GaugeField &U)
+  {
+    this->ThinLinks = &U;  // attach the smearing routine to the field U
+
+    // check the pointer is not null
+    if (this->ThinLinks == NULL) // only logs: execution continues after the message
+      std::cout << GridLogError << "[SmearedConfigurationMasked] Error in ThinLinks pointer\n";
+
+    if (this->smearingLevels > 0)
+    {
+      std::cout << GridLogMessage << "[SmearedConfigurationMasked] Filling SmearedSet\n";
+      GaugeField previous_u(this->ThinLinks->Grid());
+
+      GaugeField smeared_A(this->ThinLinks->Grid());
+      GaugeField smeared_B(this->ThinLinks->Grid());
+
+      previous_u = *this->ThinLinks;
+      double start = usecond();
+      for (int smearLvl = 0; smearLvl < this->smearingLevels; ++smearLvl)
+      {
+        this->StoutSmearing->smear(smeared_A, previous_u);
+	ApplyMask(smeared_A,smearLvl); // smeared links, masked
+	smeared_B = previous_u;
+	ApplyMask(smeared_B,smearLvl); // input links, masked
+	// Replace only the masked portion
+	this->SmearedSet[smearLvl] = previous_u-smeared_B + smeared_A;
+        previous_u = this->SmearedSet[smearLvl]; // the next level smears the output of this one
+
+        // For debug purposes
+        RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(previous_u);
+        std::cout << GridLogMessage << "[SmearedConfigurationMasked] smeared Plaq: " << impl_plaq << std::endl;
+      }
+      double end = usecond();
+      double time = (end - start)/ 1e3;
+      std::cout << GridLogMessage << "GaugeConfigurationMasked: Link smearing took " << time << " ms" << std::endl;  
+    }
+  }
+  //====================================================================
+  // Masked variant of the base chain rule. The extra 'level' argument (and missing const) makes this an overload that hides, not overrides, the two-argument base method
+  virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
+					  const GaugeField& GaugeK,int level) 
+  {
+    GridBase* grid = GaugeK.Grid();
+    GaugeField C(grid), SigmaK(grid), iLambda(grid);
+    GaugeField SigmaKPrimeA(grid);
+    GaugeField SigmaKPrimeB(grid);
+    GaugeLinkField iLambda_mu(grid);
+    GaugeLinkField iQ(grid), e_iQ(grid);
+    GaugeLinkField SigmaKPrime_mu(grid);
+    GaugeLinkField GaugeKmu(grid), Cmu(grid);
+    
+    this->StoutSmearing->BaseSmear(C, GaugeK);
+    SigmaK = Zero();
+    iLambda = Zero();
+
+    SigmaKPrimeA = SigmaKPrime;
+    ApplyMask(SigmaKPrimeA,level); // A: the part of the incoming force selected by this level's mask
+    SigmaKPrimeB = SigmaKPrime - SigmaKPrimeA; // B: the remainder, added back unchanged at the end
+    
+    // Could get away with computing only one polarisation here
+    // int mu= (smr/2) %Nd;
+    // SigmaKprime_A has only one component
+    for (int mu = 0; mu < Nd; mu++)
+    {
+      Cmu = peekLorentz(C, mu);
+      GaugeKmu = peekLorentz(GaugeK, mu);
+      SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
+      iQ = Ta(Cmu * adj(GaugeKmu));
+      this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
+      pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
+      pokeLorentz(iLambda, iLambda_mu, mu);
+    }
+    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // propagate the rest of the force as identity map, just add back
+    ////////////////////////////////////////////////////////////////////////////////////
+    SigmaK = SigmaK+SigmaKPrimeB;
+
+    return SigmaK;
+  }
+
+public:
+
+  /* Standard constructor: level i gets a mask selecting direction (i/2)%Nd on checkerboard i%2; Nsmear must be a multiple of 2*Nd */
+  SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
+    : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
+  {
+    assert(Nsmear%(2*Nd)==0); // Or multiply by 8??
+
+    // was resized in base class
+    assert(this->SmearedSet.size()==Nsmear);
+    
+    GridRedBlackCartesian * UrbGrid; // temporary red-black grid, deleted at the end of the constructor
+    UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
+    LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
+    LatticeComplex tmp(_UGrid);
+
+    for (unsigned int i = 0; i < this->smearingLevels; ++i) {
+      // Build the mask from a temporary: the previous *(new ...) copy leaked one heap lattice per level
+      masks.push_back(LatticeLorentzComplex(_UGrid));
+
+      int mu= (i/2) %Nd;
+      int cb= (i%2);
+      LatticeComplex tmpcb(UrbGrid); // destroyed each iteration, before UrbGrid is deleted
+	
+      masks[i]=Zero();
+      ////////////////////
+      // Setup the mask
+      ////////////////////
+      tmp = Zero();
+      pickCheckerboard(cb,tmpcb,one);
+      setCheckerboard(tmp,tmpcb); // tmp is 1 on checkerboard cb and 0 elsewhere
+      PokeIndex<LorentzIndex>(masks[i],tmp, mu); // only direction mu is non-zero in masks[i]
+	
+    }
+    delete UrbGrid;
+  }
+  
+  virtual void smeared_force(GaugeField &SigmaTilde) // in place: chain-rules the force from the most-smeared field down to the thin links, then projects it
+  {
+    if (this->smearingLevels > 0)
+    {
+      double start = usecond();
+      GaugeField force = SigmaTilde; // actually = U*SigmaTilde
+      GaugeLinkField tmp_mu(SigmaTilde.Grid());
+
+      // Remove U from UdSdU
+      for (int mu = 0; mu < Nd; mu++)
+      {
+        // to get just SigmaTilde
+        tmp_mu = adj(peekLorentz(this->SmearedSet[this->smearingLevels - 1], mu)) * peekLorentz(force, mu);
+        pokeLorentz(force, tmp_mu, mu);
+      }
+
+      for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) { // levels from the top down to 1
+        force = this->AnalyticSmearedForce(force, this->get_smeared_conf(ismr - 1),ismr);
+      }
+      
+      force = this->AnalyticSmearedForce(force, *this->ThinLinks,0); // level 0 acts on the thin links
+
+      // Add U to UdSdU
+      for (int mu = 0; mu < Nd; mu++)
+      {
+        tmp_mu = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
+        pokeLorentz(SigmaTilde, tmp_mu, mu);
+      }
+
+
+      double end = usecond();
+      double time = (end - start)/ 1e3;
+      std::cout << GridLogMessage << " GaugeConfigurationMasked: Smeared Force chain rule took " << time << " ms" << std::endl;
+
+    }  // if smearingLevels = 0 do nothing
+    SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta; applied even when smearingLevels == 0
+  }
+
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/smearing/JacobianAction.h
+++ b/Grid/qcd/smearing/JacobianAction.h
@@ -0,0 +1,87 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/smearing/JacobianAction.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+////////////////////////////////////////////////////////////////////////
+// Jacobian Action: S = -logDetJacobian() of a SmearedConfigurationMasked
+////////////////////////////////////////////////////////////////////////
+template <class Gimpl>
+class JacobianAction : public Action<typename Gimpl::GaugeField> {
+public:  
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  SmearedConfigurationMasked<Gimpl> * smearer; // not owned: never deleted by this class
+  /////////////////////////// constructors
+  explicit JacobianAction(SmearedConfigurationMasked<Gimpl> * _smearer ) : smearer(_smearer) {}
+
+  virtual std::string action_name() {return "JacobianAction";}
+
+  virtual std::string LogParameters(){
+    std::stringstream sstream;
+    sstream << GridLogMessage << "[JacobianAction] " << std::endl;
+    return sstream.str();
+  }
+
+  //////////////////////////////////
+  // Plain gauge-field entry points are not used: each one asserts
+  //////////////////////////////////
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ assert(0); }
+  virtual RealD S(const GaugeField &U) { assert(0); return 0.0; } // return keeps NDEBUG builds well defined
+  virtual void deriv(const GaugeField &U, GaugeField &dSdU) { assert(0);  }
+
+  //////////////////////////////////
+  // Functions of smart configs only: U must be the smearer given at construction
+  //////////////////////////////////
+  virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
+  {
+    return; // nothing to refresh
+  }
+  virtual RealD S(ConfigurationBase<GaugeField>& U)
+  {
+    // det M = e^{ - ( - logDetM) }
+    assert( &U == smearer );
+    return -smearer->logDetJacobian();
+  }
+  virtual RealD Sinitial(ConfigurationBase<GaugeField>& U) 
+  {
+    return S(U);
+  }
+  virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
+  {
+    assert( &U == smearer );
+    smearer->logDetJacobianForce(dSdU);
+  }
+
+private:
+ };
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -40,7 +40,9 @@ template <class Gimpl>
 class Smear_Stout : public Smear<Gimpl> {
 private:
  int OrthogDim = -1;
+public:
  const std::vector<double> SmearRho;
+private:
  // Smear<Gimpl>* ownership semantics:
  //    Smear<Gimpl>* passed in to constructor are owned by caller, so we don't delete them here
  //    Smear<Gimpl>* created within constructor need to be deleted as part of the destructor
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@@ -34,6 +34,61 @@ directory

 NAMESPACE_BEGIN(Grid);

+template<int N, class Vec>
+Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
+{
+  // Per-site determinant of an NxN matrix field, evaluated in double precision via Eigen
+  typedef typename Vec::scalar_type scalar;
+  GridBase *grid=Umu.Grid();
+  auto lvol = grid->lSites();
+  Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
+  autoView(Umu_v,Umu,CpuRead);
+  autoView(ret_v,ret,CpuWrite);
+  thread_for(site,lvol,{
+    Coordinate lcoor;
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    iScalar<iScalar<iMatrix<scalar, N> > > Us;
+    peekLocalSite(Us, Umu_v, lcoor);
+    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
+    for(int row=0;row<N;row++){
+      for(int col=0;col<N;col++){
+	scalar entry= Us()()(row,col);
+	EigenU(row,col)=ComplexD(real(entry),imag(entry));
+      }}
+    ComplexD detD  = EigenU.determinant();
+    scalar det(detD.real(),detD.imag());
+    pokeLocalSite(det,ret_v,lcoor);
+  });
+  return ret;
+}
+
+template<int N, class Vec>
+static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
+{
+  Umu      = ProjectOnGroup(Umu);
+  auto det = Determinant(Umu);
+  // Scaling row N-1 by conj(det) multiplies the determinant of Umu by conj(det)
+  det = conjugate(det);
+  // The row index is the template parameter N (not the global Nc) for both peek and poke
+  for(int i=0;i<N;i++){
+    auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
+    element = element * det;
+    PokeIndex<ColourIndex>(Umu,element,N-1,i);
+  }
+}
+template<int N,class Vec>
+static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<Vec, N> >,Nd> > &U)
+{
+  // Apply the single-link ProjectSUn to each of the Nd Lorentz components of U
+  // Reunitarise
+  for(int mu=0;mu<Nd;mu++){
+    auto Umu = PeekIndex<LorentzIndex>(U,mu);
+    Umu      = ProjectOnGroup(Umu); // NOTE(review): ProjectSUn(Umu) projects again; kept so results are unchanged
+    ProjectSUn(Umu);
+    PokeIndex<LorentzIndex>(U,Umu,mu);
+  }
+}
+
 template <int ncolour>
 class SU {
 public:
@@ -741,8 +796,14 @@ public:
    typedef Lattice<vMatrixType> LatticeMatrixType;

    LatticeMatrixType Umu(out.Grid());
+    LatticeMatrixType tmp(out.Grid());
    for (int mu = 0; mu < Nd; mu++) {
-      LieRandomize(pRNG, Umu, 1.0);
+      //      LieRandomize(pRNG, Umu, 1.0);
+      //      PokeIndex<LorentzIndex>(out, Umu, mu);
+      gaussian(pRNG,Umu);
+      tmp = Ta(Umu);
+      taExp(tmp,Umu);
+      ProjectSUn(Umu);
      PokeIndex<LorentzIndex>(out, Umu, mu);
    }
  }
@@ -799,12 +860,12 @@ public:
 };

 template<int N>
-LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
+Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
 {
  GridBase *grid=Umu.Grid();
  auto lvol = grid->lSites();
-  LatticeComplexD ret(grid);
-
+  Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
+  
  autoView(Umu_v,Umu,CpuRead);
  autoView(ret_v,ret,CpuWrite);
  thread_for(site,lvol,{
@@ -812,42 +873,21 @@ LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N>
    Coordinate lcoor;
    grid->LocalIndexToLocalCoor(site, lcoor);
    iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
+    iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
    peekLocalSite(Us, Umu_v, lcoor);
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	EigenU(i,j) = Us()()(i,j);
      }}
-    ComplexD det = EigenU.determinant();
-    pokeLocalSite(det,ret_v,lcoor);
+    Eigen::MatrixXcd EigenUinv = EigenU.inverse();
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	Ui()()(i,j) = EigenUinv(i,j);
+      }}
+    pokeLocalSite(Ui,ret_v,lcoor);
  });
  return ret;
 }
-template<int N>
-static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
-{
-  Umu      = ProjectOnGroup(Umu);
-  auto det = Determinant(Umu);
-
-  det = conjugate(det);
-
-  for(int i=0;i<N;i++){
-    auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
-    element = element * det;
-    PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
-  }
-}
-template<int N>
-static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<vComplexD, N> >,Nd> > &U)
-{
-  GridBase *grid=U.Grid();
-  // Reunitarise
-  for(int mu=0;mu<Nd;mu++){
-    auto Umu = PeekIndex<LorentzIndex>(U,mu);
-    Umu      = ProjectOnGroup(Umu);
-    ProjectSUn(Umu);
-    PokeIndex<LorentzIndex>(U,Umu,mu);
-  }
-}
 // Explicit specialisation for SU(3).
 // Explicit specialisation for SU(3).
 static void
--- a/Grid/qcd/utils/SUnAdjoint.h
+++ b/Grid/qcd/utils/SUnAdjoint.h
@@ -51,6 +51,7 @@ public:
  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;

+  typedef Lattice<iScalar<iScalar<iVector<vComplex, Dimension> > > >  LatticeAdjVector;

  template <class cplx>
  static void generator(int Index, iSUnAdjointMatrix<cplx> &iAdjTa) {
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -123,7 +123,7 @@ public:
 	  }
 	  if ( permute_slice ) {
 	    int ptype       =grid->PermuteType(d);
-	    uint8_t mask    =grid->Nsimd() >> (ptype + 1);		
+	    uint8_t mask    =0x1<<ptype;
 	    SE._permute    |= mask;
 	  }
 	}	
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -339,8 +339,8 @@ public:
  // Vectors that live on the symmetric heap in case of SHMEM
  // These are used; either SHM objects or refs to the above symmetric heap vectors
  // depending on comms target
-  Vector<cobj *> u_simd_send_buf;
-  Vector<cobj *> u_simd_recv_buf;
+  std::vector<cobj *> u_simd_send_buf;
+  std::vector<cobj *> u_simd_recv_buf;

  int u_comm_offset;
  int _unified_buffer_size;
@@ -348,7 +348,7 @@ public:
  ////////////////////////////////////////
  // Stencil query
  ////////////////////////////////////////
-#ifdef SHM_FAST_PATH
+#if 1
  inline int SameNode(int point) {

    int dimension    = this->_directions[point];
@@ -434,7 +434,6 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    accelerator_barrier();
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@@ -452,7 +451,6 @@ public:
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
    acceleratorCopySynchronise();
-    // Everyone agrees we are all done
    _grid->StencilBarrier(); 
  }
  ////////////////////////////////////////////////////////////////////////
@@ -541,6 +539,7 @@ public:
      compress.Point(point);
      HaloGatherDir(source,compress,point,face_idx);
    }
+    accelerator_barrier();
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);

@@ -666,11 +665,9 @@ public:
    for(int i=0;i<mm.size();i++){
      decompressor::MergeFace(decompress,mm[i]);
    }
-    if ( mm.size() )    acceleratorFenceComputeStream();
    for(int i=0;i<dd.size();i++){
      decompressor::DecompressFace(decompress,dd[i]);
    }
-    if ( dd.size() )    acceleratorFenceComputeStream();
  }
  ////////////////////////////////////////
  // Set up routines
@@ -708,6 +705,7 @@ public:
 	}
      }
    }
+    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -1369,10 +1367,11 @@ public:
 	    int recv_from_rank;
 	    int xmit_to_rank;
 	    int shm_send=0;
-	    int shm_recv=0;
+
 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
 #ifdef SHM_FAST_PATH
  #warning STENCIL SHM FAST PATH SELECTED
+  	  int shm_recv=0;
 	    // shm == receive pointer         if offnode
 	    // shm == Translate[send pointer] if on node -- my view of his send pointer
 	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
@@ -1405,7 +1404,6 @@ public:
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
 	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
-	      int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
 	      AddPacket((void *)sp,(void *)rp,
 			xmit_to_rank,do_send,
 			recv_from_rank,do_send,
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c


 // Specialisation: Cayley-Hamilton exponential for SU(3)
-#ifndef GRID_ACCELERATED
+#if 0
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
 accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
 {
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ -133,7 +133,6 @@ typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec
  typedef scalar_type * pointer;

  constexpr int words=sizeof(vobj)/sizeof(vector_type);
-  constexpr int Nsimd=vector_type::Nsimd();

  scalar_object extracted;
  pointer __restrict__  sp = (pointer)&extracted; // Type pun
@@ -153,7 +152,6 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob
  typedef scalar_type * pointer;

  constexpr int words=sizeof(vobj)/sizeof(vector_type);
-  constexpr int Nsimd=vector_type::Nsimd();

  pointer __restrict__ sp = (pointer)&extracted;
  vector_type *vp = (vector_type *)&vec;
@@ -178,8 +176,6 @@ void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int off
  const int s = Nsimd/Nextr;

  vector_type * vp = (vector_type *)&vec;
-  scalar_type      vtmp;
-  sobj_scalar_type stmp;
  for(int w=0;w<words;w++){
    for(int i=0;i<Nextr;i++){
      sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
@@ -205,7 +201,6 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)

  vector_type * vp = (vector_type *)&vec;
  scalar_type      vtmp;
-  sobj_scalar_type stmp;
  for(int w=0;w<words;w++){
    for(int i=0;i<Nextr;i++){
      sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
@@ -242,9 +237,6 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
  typedef oextract_type * opointer;
  typedef iextract_type * ipointer;

-  constexpr int oNsimd=ovector_type::Nsimd();
-  constexpr int iNsimd=ivector_type::Nsimd();
-
  iscalar_type itmp;
  oscalar_type otmp;

--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -526,7 +526,7 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 //////////////////////////////////////////////

 #ifdef GRID_SYCL
-inline void acceleratorFenceComputeStream(void){ theGridAccelerator->submit_barrier();};
+inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
 #else
 // Ordering within a stream guaranteed on Nvidia & AMD
 inline void acceleratorFenceComputeStream(void){ };
--- a/HMC/FTHMC2p1f.cc
+++ b/HMC/FTHMC2p1f.cc
@@ -0,0 +1,224 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
+#include <Grid/qcd/smearing/JacobianAction.h>
+
+using namespace Grid;
+
+int main(int argc, char **argv)
+{
+  std::cout << std::setprecision(12);
+  // Driver for the "FT HMC": EOFA strange + 2f light quotients (Level1), Jacobian (Level2), Iwasaki gauge (Level3)
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef WilsonImplR FermionImplPolicy;
+  typedef MobiusFermionD FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  typedef Grid::XmlReader       Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  MD.name    = std::string("Leap Frog");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = 12;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 0;
+  HMCparams.Trajectories     = 200;
+  HMCparams.NoMetropolisUntil=  20;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  HMCparams.StartingType     =std::string("HotStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
+  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.saveInterval  = 1;
+  CPparams.saveSmeared   = true;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.04;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD b   = 1.0; // NOTE(review): was "Scale factor two", but b+c = 1.0 here - confirm intent
+  RealD c   = 0.0;
+
+  OneFlavourRationalParams OFRp;
+  OFRp.lo       = 1.0e-2;
+  OFRp.hi       = 64;
+  OFRp.MaxIter  = 10000;
+  OFRp.tolerance= 1.0e-10;
+  OFRp.degree   = 14;
+  OFRp.precision= 40;
+
+  std::vector<Real> hasenbusch({ 0.1 });
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+  LatticeGaugeField Uhot(GridPtr);
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+
+  double StoppingCondition = 1e-10;
+  double MaxCGIterations = 30000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+
+  bool ApplySmearing = true;
+  
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(2);
+  ActionLevel<HMCWrapper::Field> Level3(4);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+
+  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 CG,
+	 CG, CG,
+	 CG, CG, 
+	 OFRp, false);
+
+  EOFA.is_smeared = ApplySmearing;
+  Level1.push_back(&EOFA);
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]);
+    light_num.push_back(hasenbusch[h]);
+  }
+  light_num.push_back(pv_mass);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
+    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+  }
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    Quotients[h]->is_smeared = ApplySmearing;
+    Level1.push_back(Quotients[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // lnDetJacobianAction
+  /////////////////////////////////////////////////////////////
+  double rho = 0.1;  // smearing parameter
+  int Nsmear = 1;    // number of full sweeps; presumably each sweep is 2Nd masked smearing levels - confirm
+  int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd
+  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
+  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
+  if( ApplySmearing ) Level2.push_back(&Jacobian);
+  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
+
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  //  GaugeAction.is_smeared = ApplySmearing;
+  GaugeAction.is_smeared = true;
+  Level3.push_back(&GaugeAction);
+
+  std::cout << GridLogMessage << " ************************************************"<< std::endl;
+  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; // NOTE(review): message looks stale - fermion actions are pushed to Level1 above
+  std::cout << GridLogMessage << " ************************************************"<< std::endl;
+  std::cout << GridLogMessage <<  std::endl;
+  std::cout << GridLogMessage <<  std::endl;
+
+
+  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+
+  TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
@@ -227,7 +227,7 @@ int main(int argc, char **argv) {
  //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

-  int SP_iters=10000;
+  int SP_iters=9000;
  
  RationalActionParams OFRp; // Up/down
  OFRp.lo       = 6.0e-5;
@@ -362,12 +362,12 @@ int main(int argc, char **argv) {

  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.25;
+  SFRp.lo       = 0.1;
  SFRp.hi       = 25.0;
  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-5;
+  SFRp.tolerance= 1.0e-8;
  SFRp.mdtolerance= 2.0e-4;
-  SFRp.degree   = 8;
+  SFRp.degree   = 12;
  SFRp.precision= 50;
  
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
--- a/HMC/Mobius2p1f_DD_RHMC_96I.cc
+++ b/HMC/Mobius2p1f_DD_RHMC_96I.cc
@@ -329,7 +329,6 @@ int main(int argc, char **argv) {

    
    auto grid4= GridPtr;
-    auto rbgrid4= GridRBPtr;
    auto rbgrid = StrangeOp.FermionRedBlackGrid();
    auto grid = StrangeOp.FermionGrid();
    if(1){
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@@ -146,6 +146,8 @@ NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  using namespace Grid;

+  std::cout << " Grid Initialise "<<std::endl;
+  
  Grid_init(&argc, &argv);

  CartesianCommunicator::BarrierWorld();
@@ -170,24 +172,24 @@ int main(int argc, char **argv) {
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
-  MD.name    = std::string("Force Gradient");
-  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
-  // MD.name    = std::string("MinimumNorm2");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  MD.name    = std::string("MinimumNorm2");
  // TrajL = 2
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
-  MD.MDsteps =  12;
+  MD.MDsteps =  14;
  MD.trajL   = 0.5;

  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
-  HMCparams.Trajectories     = 1;
+  HMCparams.Trajectories     = 20;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("ColdStart");
-  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);

@@ -223,7 +225,7 @@ int main(int argc, char **argv) {
  Real pv_mass      = 1.0;
  //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

  auto GridPtr   = TheHMC.Resources.GetCartesian();
@@ -244,11 +246,6 @@ int main(int argc, char **argv) {
  Coordinate shm;

  GlobalSharedMemory::GetShmDims(mpi,shm);
-  
-  Coordinate CommDim(Nd);
-  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-
-  Coordinate NonDirichlet(Nd+1,0);

  //////////////////////////
  // Fermion Grids
@@ -277,15 +274,13 @@ int main(int argc, char **argv) {
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  FermionActionF::ImplParams ParamsF(boundary);
-  Params.dirichlet=NonDirichlet;
-  ParamsF.dirichlet=NonDirichlet;

  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-8;
-  double MDStoppingCondition = 1e-7;
-  double MDStoppingConditionLoose = 1e-7;
-  double MDStoppingConditionStrange = 1e-7;
+  double StoppingCondition = 1e-9;
+  double MDStoppingCondition = 1e-8;
+  double MDStoppingConditionLoose = 1e-8;
+  double MDStoppingConditionStrange = 1e-8;
  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
@@ -305,12 +300,12 @@ int main(int argc, char **argv) {

  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.25;
-  SFRp.hi       = 25.0;
+  SFRp.lo       = 0.1;
+  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-5;
-  SFRp.mdtolerance= 2.0e-4;
-  SFRp.degree   = 8;
+  SFRp.tolerance= 1.0e-8;
+  SFRp.mdtolerance= 2.0e-6;
+  SFRp.degree   = 10;
  SFRp.precision= 50;
  
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
@@ -370,19 +365,17 @@ int main(int argc, char **argv) {
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
-  std::vector<int> dirichlet_den;
-  std::vector<int> dirichlet_num;

  int n_hasenbusch = hasenbusch.size();
-  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
+  light_den.push_back(light_mass); 
  for(int h=0;h<n_hasenbusch;h++){
-    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(0);
+    light_den.push_back(hasenbusch[h]);
  }

  for(int h=0;h<n_hasenbusch;h++){
-    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(0);
+    light_num.push_back(hasenbusch[h]);
  }
-  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
+  light_num.push_back(pv_mass);

  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
@@ -408,9 +401,7 @@ int main(int argc, char **argv) {
    std::cout << GridLogMessage
 	      << " 2f quotient Action ";
    std::cout << "det D("<<light_den[h]<<")";
-    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
    std::cout << "/ det D("<<light_num[h]<<")";
-    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
    std::cout << std::endl;

    FermionAction::ImplParams ParamsNum(boundary);
@@ -418,21 +409,11 @@ int main(int argc, char **argv) {
    FermionActionF::ImplParams ParamsDenF(boundary);
    FermionActionF::ImplParams ParamsNumF(boundary);
    
-    ParamsNum.dirichlet = NonDirichlet;
-    ParamsDen.dirichlet = NonDirichlet;
-
-    ParamsNum.partialDirichlet = 0;
-    ParamsDen.partialDirichlet = 0;
-    
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));

-    ParamsDenF.dirichlet = ParamsDen.dirichlet;
-    ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));

-    ParamsNumF.dirichlet = ParamsNum.dirichlet;
-    ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
    NumeratorsF.push_back  (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));

    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
@@ -469,7 +450,6 @@ int main(int argc, char **argv) {
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
-  //  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -425,7 +425,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)

  err = r_eo-result;
  n2e= norm2(err);
-  std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< n2e<<std::endl;
  assert(n2e<1.0e-4);

  pickCheckerboard(Even,src_e,err);
--- a/documentation/David_notes.txt
+++ b/documentation/David_notes.txt
@@ -0,0 +1,90 @@
+Branch: develop
+
+Files:
+
+Grid/lattice/PaddedCell.h -- Halo exchange
+tests/Test_general_stencil.cc -- test local off axis stencil addressing
+tests/debug/Test_padded_cell.cc -- test PaddedCell halo exchange and the General local stencil  by computing ALL plaquettes on lattice
+
+Functionality:
+
+-- extend a lattice field:
+Grid/lattice/PaddedCell.h
+
+// Constructor
+  PaddedCell(int _depth,GridCartesian *_grid)
+
+// Expand a field "in" to depth "d"
+  template<class vobj>
+  inline Lattice<vobj> Exchange(Lattice<vobj> &in)
+  
+// Extract the interior ("apple core") of the padded field "in" onto the smaller local volume
+  template<class vobj>
+  inline Lattice<vobj> Extract(Lattice<vobj> &in)
+
+-- Plaquette test:
+tests/debug/Test_padded_cell.cc
+  /////////////////////////////////////////////////
+  // Create a padded cell of extra padding depth=1
+  /////////////////////////////////////////////////
+  int depth = 1;
+  PaddedCell Ghost(depth,&GRID);
+  LatticeGaugeField Ughost = Ghost.Exchange(Umu);
+
+///// Array for the site plaquette
+  GridBase *GhostGrid = Ughost.Grid();
+  LatticeComplex gplaq(GhostGrid); 
+
+  std::vector<Coordinate> shifts;
+  for(int mu=0;mu<Nd;mu++){
+    for(int nu=mu+1;nu<Nd;nu++){
+  
+      //    Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
+      Coordinate shift_0(Nd,0);
+      Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
+      Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
+      shifts.push_back(shift_0);
+      shifts.push_back(shift_mu);
+      shifts.push_back(shift_nu);
+      shifts.push_back(shift_0);
+    }
+  }
+  GeneralLocalStencil gStencil(GhostGrid,shifts);
+
+  gplaq=Zero();
+  {
+    autoView( gp_v , gplaq, CpuWrite);
+    autoView( t_v , trplaq, CpuRead);
+    autoView( U_v , Ughost, CpuRead);
+    for(int ss=0;ss<gp_v.size();ss++){
+      int s=0;
+      for(int mu=0;mu<Nd;mu++){
+	for(int nu=mu+1;nu<Nd;nu++){
+
+	  auto SE0 = gStencil.GetEntry(s+0,ss);
+	  auto SE1 = gStencil.GetEntry(s+1,ss);
+	  auto SE2 = gStencil.GetEntry(s+2,ss);
+	  auto SE3 = gStencil.GetEntry(s+3,ss);
+	
+	  int o0 = SE0->_offset;
+	  int o1 = SE1->_offset;
+	  int o2 = SE2->_offset;
+	  int o3 = SE3->_offset;
+	  
+	  auto U0 = U_v[o0](mu);
+	  auto U1 = U_v[o1](nu);
+	  auto U2 = adj(U_v[o2](mu));
+	  auto U3 = adj(U_v[o3](nu));
+
+	  gpermute(U0,SE0->_permute);
+	  gpermute(U1,SE1->_permute);
+	  gpermute(U2,SE2->_permute);
+	  gpermute(U3,SE3->_permute);
+	  
+	  gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 );
+	  s=s+4;
+	}
+      }
+    }
+  }
+  cplaq = Ghost.Extract(gplaq);
--- a/examples/socket_grid.cc
+++ b/examples/socket_grid.cc
@@ -0,0 +1,133 @@
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <err.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+static int sock;
+static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
+static char sock_path[256];
+
+class UnixSockets {
+public:
+  static void Open(int rank)
+  {
+    // Create this rank's datagram socket and bind it to /tmp/GridUnixSocket.<rank>.
+    // socket() returns -1 on failure; 0 is a valid descriptor, so test for >= 0.
+    sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>=0);
+    printf("allocated socket %d\n",sock);
+
+    struct sockaddr_un sa_un = { 0 };
+    sa_un.sun_family = AF_UNIX;
+    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
+    unlink(sa_un.sun_path);
+    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
+      perror("bind failure");
+      exit(EXIT_FAILURE);
+    }
+    printf("bound socket %d to %s\n",sock,sa_un.sun_path);
+  }
+
+  static int RecvFileDescriptor(void)
+  {
+    // Receive one file descriptor sent as SCM_RIGHTS ancillary data on "sock".
+    // Returns the received descriptor, or -1 on any failure.
+    int n;
+    int fd;
+    char buf[1];
+    struct iovec iov;
+    struct msghdr msg;
+    struct cmsghdr *cmsg;
+    char cms[CMSG_SPACE(sizeof(int))];
+
+    iov.iov_base = buf;
+    iov.iov_len = 1;
+
+    memset(&msg, 0, sizeof msg); // also leaves msg_name / msg_namelen zeroed
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    msg.msg_control = (caddr_t)cms;
+    msg.msg_controllen = sizeof cms;
+
+    if((n=recvmsg(sock, &msg, 0)) < 0) {
+      perror("recvmsg failed");
+      return -1;
+    }
+    cmsg = CMSG_FIRSTHDR(&msg);
+    if(n == 0 || cmsg == NULL || cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS){
+      // errno is not set on this path, so report directly rather than through perror()
+      fprintf(stderr,"recvmsg returned no file descriptor\n");
+      return -1;
+    }
+    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
+    printf("received fd %d from socket %d\n",fd,sock);
+    return fd;
+  }
+
+  static void SendFileDescriptor(int fildes,int xmit_to_rank)
+  {
+    // Send descriptor "fildes" to rank "xmit_to_rank" as SCM_RIGHTS ancillary data.
+    struct msghdr msg;
+    struct iovec iov;
+    struct cmsghdr *cmsg = NULL;
+    char ctrl[CMSG_SPACE(sizeof(int))];
+    char data = ' ';
+
+    memset(&msg, 0, sizeof(struct msghdr));
+    memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
+    iov.iov_base = &data;
+    iov.iov_len = sizeof(data);
+
+    snprintf(sock_path,sizeof(sock_path),sock_path_fmt,xmit_to_rank);
+    printf("sending FD %d over socket %d to rank %d AF_UNIX path %s\n",fildes,sock,xmit_to_rank,sock_path);fflush(stdout);
+    struct sockaddr_un sa_un = { 0 };
+    sa_un.sun_family = AF_UNIX;
+    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
+
+    msg.msg_name = (void *)&sa_un;
+    msg.msg_namelen = sizeof(sa_un);
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    msg.msg_controllen =  CMSG_SPACE(sizeof(int));
+    msg.msg_control = ctrl;
+
+    cmsg = CMSG_FIRSTHDR(&msg);
+    cmsg->cmsg_level = SOL_SOCKET;
+    cmsg->cmsg_type = SCM_RIGHTS;
+    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+    // CMSG_DATA() is not guaranteed to be int-aligned: copy the payload, do not store through a cast
+    memmove(CMSG_DATA(cmsg), &fildes, sizeof(int));
+
+    if ( sendmsg(sock, &msg, 0) == -1 ) perror("sendmsg failed");
+  }
+};
+
+int main(int argc, char **argv)
+{
+  // The parent process plays rank 0 (receiver); the forked child plays rank 1 (sender).
+  int me = fork()?0:1;
+  UnixSockets::Open(me);
+  // need MPI barrier
+  sleep(10);
+  const char * message = "Hello, World\n";
+  if( me ) {
+    int fd = open("foo",O_RDWR|O_CREAT,0666);
+    if ( fd < 0 ) {
+      perror("failed to open file");
+      exit(EXIT_FAILURE);
+    }
+    // rank 1 sends to rank 0
+    UnixSockets::SendFileDescriptor(fd,0);
+    close(fd);
+  } else {
+    // rank 0 receives from rank 1
+    int fd = UnixSockets::RecvFileDescriptor();
+    if ( fd < 0 ) exit(EXIT_FAILURE); // nothing received: do not write to an invalid descriptor
+    write(fd,(const void *)message,strlen(message));
+    close(fd);
+  }
+}
--- a/grid-config.in
+++ b/grid-config.in
@@ -60,7 +60,7 @@ while test $# -gt 0; do
    ;;
    
    --cxxflags)
-      echo @GRID_CXXFLAGS@
+      echo @GRID_CXXFLAGS@ -I@prefix@/include
    ;;
    
    --cxx)
@@ -72,11 +72,11 @@ while test $# -gt 0; do
    ;;
    
    --ldflags)
-      echo @GRID_LDFLAGS@
+      echo @GRID_LDFLAGS@ -L@prefix@/lib
    ;;
    
    --libs)
-      echo @GRID_LIBS@
+      echo @GRID_LIBS@ -lGrid
    ;;
    
    --summary)
--- a/systems/Lumi/benchmarks/bench2.slurm
+++ b/systems/Lumi/benchmarks/bench2.slurm
@@ -0,0 +1,44 @@
+#!/bin/bash -l
+#SBATCH --job-name=bench_lehner
+#SBATCH --partition=small-g
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=7
+#SBATCH --gpus-per-node=8
+#SBATCH --time=00:10:00
+#SBATCH --account=project_465000546
+#SBATCH --gpu-bind=none
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+CPU_BIND="map_cpu:48,56,32,40,16,24,1,8"
+echo $CPU_BIND
+
+cat << EOF > select_gpu
+#!/bin/bash
+export GPU_MAP=(0 1 2 3 4 5 6 7)
+export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
+export HIP_VISIBLE_DEVICES=\$GPU
+unset ROCR_VISIBLE_DEVICES
+echo RANK \$SLURM_LOCALID using GPU \$GPU    
+exec "\$@"
+EOF
+
+chmod +x ./select_gpu
+
+root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
+source ${root}/sourceme.sh
+
+export OMP_NUM_THREADS=7
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+
+for vol in 16.16.16.64 32.32.32.64  32.32.32.128
+do
+srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
+#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol
+
+srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
+#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
+done
+
--- a/systems/Lumi/config-command
+++ b/systems/Lumi/config-command
@@ -0,0 +1,30 @@
+spack load c-lime
+spack load gmp
+spack load mpfr
+CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
+GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
+MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
+echo clime X$CLIME
+echo gmp X$GMP
+echo mpfr X$MPFR
+
+../../configure \
+--enable-comms=mpi-auto \
+--with-lime=$CLIME \
+--enable-unified=no \
+--enable-shm=nvlink \
+--enable-accelerator=hip \
+--enable-gen-simd-width=64 \
+--enable-simd=GPU \
+--enable-accelerator-cshift \
+--with-gmp=$GMP \
+--with-mpfr=$MPFR \
+--with-fftw=$FFTW_DIR/.. \
+--disable-fermion-reps \
+--disable-gparity \
+CXX=hipcc MPICXX=mpicxx \
+  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
+  LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp" 
+
+
+
--- a/systems/Lumi/sourceme.sh
+++ b/systems/Lumi/sourceme.sh
@@ -0,0 +1,5 @@
+source ~/spack/share/spack/setup-env.sh
+module load CrayEnv LUMI/22.12 partition/G  cray-fftw/3.3.10.1 rocm
+spack load c-lime
+spack load gmp
+spack load mpfr
--- a/systems/PVC/setup.sh
+++ b/systems/PVC/setup.sh
@@ -3,8 +3,14 @@ export https_proxy=http://proxy-chain.intel.com:911
 export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH

 module load intel-release
-source /opt/intel/oneapi/PVC_setup.sh
+module load intel-comp-rt/embargo-ci-neo
+
+#source /opt/intel/oneapi/PVC_setup.sh
 #source /opt/intel/oneapi/ATS_setup.sh
+#module load intel-nightly/20230331
+#module load intel-comp-rt/ci-neo-master/026093
+
+#module load intel/mpich
 module load intel/mpich/pvc45.3
 export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH

--- a/systems/Sunspot/benchmarks/bench.pbs
+++ b/systems/Sunspot/benchmarks/bench.pbs
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+#PBS -l select=1:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=01:00:00
+#PBS -N dwf
+#PBS -k doe
+
+HDIR=/home/paboyle/
+module use /soft/testing/modulefiles/
+module load intel-UMD23.05.25593.11/23.05.25593.11
+module load tools/pti-gpu  
+export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
+export PATH=$HDIR/tools/bin:$PATH
+
+export TZ='/usr/share/zoneinfo/US/Central'
+export OMP_PROC_BIND=spread
+export OMP_NUM_THREADS=3
+unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+qsub jobscript.pbs
+
+echo Jobid: $PBS_JOBID
+echo Running on host `hostname`
+echo Running on nodes `cat $PBS_NODEFILE`
+
+echo NODES
+cat $PBS_NODEFILE
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12         # Number of MPI ranks per node
+NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+
+echo "NUM_NODES=${NNODES}  TOTAL_RANKS=${NTOTRANKS}  RANKS_PER_NODE=${NRANKS}  THREADS_PER_RANK=${OMP_NUM_THREADS}"
+echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
+
+    
+CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+	     ./gpu_tile_compact.sh \
+	./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
+	--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
--- a/systems/Sunspot/benchmarks/gpu_tile_compact.sh
+++ b/systems/Sunspot/benchmarks/gpu_tile_compact.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+display_help() {
+  echo " Will map gpu tile to rank in compact and then round-robin fashion"
+  echo " Usage (only work for one node of ATS/PVC):"
+  echo "   mpiexec --np N gpu_tile_compact.sh ./a.out"
+  echo
+  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
+  echo "   0 Rank 0.0"
+  echo "   1 Rank 0.1"
+  echo "   2 Rank 1.0"
+  echo "   3 Rank 1.1"
+  echo "   4 Rank 2.0"
+  echo "   5 Rank 2.1"
+  echo "   6 Rank 0.0"
+  echo
+  echo " Hacked together by apl@anl.gov, please contact if bug found"
+  exit 1
+}
+
+#This gives the exact GPU count i915 knows about; udev is used to enumerate only the devices with physical presence.
+#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
+num_gpu=6
+num_tile=2
+
+if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
+  display_help
+fi
+
+gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
+tile_id=$((PALS_LOCAL_RANKID % num_tile))
+
+unset EnableWalkerPartition
+export EnableImplicitScaling=0
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
+
+if [ "$PALS_LOCAL_RANKID" = 0 ]
+then
+    onetrace --chrome-device-timeline "$@"
+#    "$@"
+else
+"$@"
+fi
--- a/systems/Sunspot/config-command
+++ b/systems/Sunspot/config-command
@@ -0,0 +1,16 @@
+TOOLS=$HOME/tools
+../../configure \
+	--enable-simd=GPU \
+	--enable-gen-simd-width=64 \
+	--enable-comms=mpi-auto \
+	--enable-accelerator-cshift \
+	--disable-gparity \
+	--disable-fermion-reps \
+	--enable-shm=nvlink \
+	--enable-accelerator=sycl \
+	--enable-unified=no \
+	MPICXX=mpicxx \
+	CXX=icpx \
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@@ -1,2 +1,4 @@
-CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=yes
+BREW=/opt/local/
+CXX=mpicxx-openmpi-mp ../../configure --enable-simd=GEN --enable-comms=mpi --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
+

--- a/tests/Test_general_stencil.cc
+++ b/tests/Test_general_stencil.cc
@@ -115,6 +115,7 @@ int main(int argc, char ** argv)
 	  if (SE->_permute & 0x2 ) { permute(check[i],tmp,1); tmp=check[i];}
 	  if (SE->_permute & 0x4 ) { permute(check[i],tmp,2); tmp=check[i];}
 	  if (SE->_permute & 0x8 ) { permute(check[i],tmp,3); tmp=check[i];}
+	  //	  std::cout<<GridLogMessage<<"stencil["<<i<<"] "<< check[i]<< " perm "<<(uint32_t)SE->_permute <<std::endl;
 	}

 	Real nrmC = norm2(Check);
@@ -138,18 +139,17 @@ int main(int argc, char ** argv)
 	  ddiff = check -bar;
 	  diff =norm2(ddiff);
 	  if ( diff > 0){
-	    std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
-		      <<") " <<check<<" vs "<<bar<<std::endl;
+	    std::cout <<"Diff at Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
+		      <<") stencil " <<check<<" vs cshift "<<bar<<std::endl;
 	  }

-
 	}}}}

 	if (nrm > 1.0e-4) {
 	  autoView( check , Check, CpuRead);
 	  autoView(   bar ,   Bar, CpuRead);
 	  for(int i=0;i<check.size();i++){
-	    std::cout << i<<" Check "<<check[i]<< "\n"<<i<<" Bar "<<bar[i]<<std::endl;
+	    std::cout << i<<" ERROR Check \n"<<check[i]<< "\n"<<i<<" Bar \n"<<bar[i]<<std::endl;
 	  }
 	}
 	if (nrm > 1.0e-4) exit(-1);
--- a/tests/core/Test_fft_pf.cc
+++ b/tests/core/Test_fft_pf.cc
@@ -0,0 +1,307 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_fft_pf.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1});
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  int vol = 1;
+  for(int d=0;d<latt_size.size();d++){
+    vol = vol * latt_size[d];
+  }
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian RBGRID(&GRID);
+
+  ComplexD ci(0.0,1.0);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridSerialRNG          sRNG;  sRNG.SeedFixedIntegers(seeds); // naughty seeding
+  GridParallelRNG          pRNG(&GRID);
+  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeFieldD Umu(&GRID);
+
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
+
+  ////////////////////////////////////////////////////
+  // PF prop
+  ////////////////////////////////////////////////////
+  LatticeFermionD    src(&GRID);
+
+  gaussian(pRNG,src);
+#if 1
+    Coordinate point(4,0);
+    src=Zero();
+    SpinColourVectorD ferm; gaussian(sRNG,ferm);
+    pokeSite(ferm,src,point);
+#endif
+  
+  {
+    std::cout<<"****************************************"<<std::endl;
+    std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator \n";
+    std::cout<<"****************************************"<<std::endl;
+
+    //    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    LatticeFermionD    diff(&GRID);
+
+    const int Ls=48+1;
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
+
+    RealD mass=0.1;
+    RealD M5  =0.8;
+    OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
+
+    // Momentum space prop
+    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
+    bool fiveD = false; //calculate 4d free propagator
+
+    std::cout << " Free propagator " <<std::endl;
+    Dov.FreePropagator(src,ref,mass) ;
+    std::cout << " Free propagator norm "<< norm2(ref) <<std::endl;
+
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    LatticeFermionD    src5(FGrid); src5=Zero();
+    LatticeFermionD    tmp5(FGrid); 
+    LatticeFermionD    result5(FGrid); result5=Zero();
+    LatticeFermionD    result4(&GRID); 
+    const int sdir=0;
+
+    ////////////////////////////////////////////////////////////////////////
+    // Import
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Free propagator Import "<< norm2(src) <<std::endl;
+    Dov.ImportPhysicalFermionSource  (src,src5);
+    std::cout << " Free propagator Imported "<< norm2(src5) <<std::endl;
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Conjugate gradient on normal equations system
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
+    Dov.Mdag(src5,tmp5);
+    src5=tmp5;
+    MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
+    CG(HermOp,src5,result5);
+    ////////////////////////////////////////////////////////////////////////
+    // Domain wall physical field propagator
+    ////////////////////////////////////////////////////////////////////////
+    Dov.ExportPhysicalFermionSolution(result5,result4);
+
+    // From DWF4d.pdf :
+    //
+    // Dov_pf = 2/(1-m) D_cayley_ovlap  [ Page 43 ]
+    // Dinv_cayley_ovlap = 2/(1-m) Dinv_pf 
+    // Dinv_cayley_surface =1/(1-m) ( Dinv_cayley_ovlap - 1 ) =>  2/(1-m)^2 Dinv_pf - 1/(1-m) * src   [ Eq.2.67 ]
+
+    RealD scale = 2.0/(1.0-mass)/(1.0-mass);
+    result4 = result4 * scale;
+    result4 = result4 - src*(1.0/(1.0-mass)); // Subtract contact term
+    DumpSliceNorm("Src",src);
+    DumpSliceNorm("Grid",result4);
+    DumpSliceNorm("Fourier",ref);
+
+    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
+    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
+
+    diff = result4- ref;
+    DumpSliceNorm("diff ",diff);
+    
+  }
+  
+  ////////////////////////////////////////////////////
+  // Dwf prop
+  ////////////////////////////////////////////////////
+  {
+    std::cout<<"****************************************"<<std::endl;
+    std::cout << "Testing Dov(Hw) Mom space 4d propagator \n";
+    std::cout<<"****************************************"<<std::endl;
+
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    LatticeFermionD    diff(&GRID);
+
+    const int Ls=48;
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
+
+    RealD mass=0.1;
+    RealD M5  =0.8;
+
+    OverlapWilsonCayleyTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
+
+    // Momentum space prop
+    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
+    Dov.FreePropagator(src,ref,mass) ;
+
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    LatticeFermionD    src5(FGrid); src5=Zero();
+    LatticeFermionD    tmp5(FGrid); 
+    LatticeFermionD    result5(FGrid); result5=Zero();
+    LatticeFermionD    result4(&GRID); 
+    const int sdir=0;
+
+    ////////////////////////////////////////////////////////////////////////
+    // Domain wall physical field source; need D_minus
+    ////////////////////////////////////////////////////////////////////////
+    /*
+	chi_5[0]   = chiralProjectPlus(chi);
+	chi_5[Ls-1]= chiralProjectMinus(chi);
+    */      
+    tmp =   (src + G5*src)*0.5;      InsertSlice(tmp,src5,   0,sdir);
+    tmp =   (src - G5*src)*0.5;      InsertSlice(tmp,src5,Ls-1,sdir);
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Conjugate gradient on normal equations system
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
+    Dov.Dminus(src5,tmp5);
+    src5=tmp5;
+    Dov.Mdag(src5,tmp5);
+    src5=tmp5;
+    MdagMLinearOperator<OverlapWilsonCayleyTanhFermionD,LatticeFermionD> HermOp(Dov);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
+    CG(HermOp,src5,result5);
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Domain wall physical field propagator
+    ////////////////////////////////////////////////////////////////////////
+    /*
+      psi  = chiralProjectMinus(psi_5[0]);
+      psi += chiralProjectPlus(psi_5[Ls-1]);
+    */
+    ExtractSlice(tmp,result5,0   ,sdir);   result4 =         (tmp-G5*tmp)*0.5;
+    ExtractSlice(tmp,result5,Ls-1,sdir);   result4 = result4+(tmp+G5*tmp)*0.5;
+    
+    std::cout << " Taking difference" <<std::endl;
+    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
+    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
+    DumpSliceNorm("Grid",result4);
+    DumpSliceNorm("Fourier",ref);
+    diff = ref - result4;
+    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
+    
+    DumpSliceNorm("diff",diff);
+
+  }
+
+  
+  {
+    std::cout<<"****************************************"<<std::endl;
+    std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator with q\n";
+    std::cout<<"****************************************"<<std::endl;
+
+    //    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    LatticeFermionD    diff(&GRID);
+
+    const int Ls=48+1;
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
+
+    RealD mass=0.1;
+    RealD M5  =0.8;
+    OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0);
+
+    // Momentum space prop
+    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
+    bool fiveD = false; //calculate 4d free propagator
+
+    std::cout << " Free propagator " <<std::endl;
+    Dov.FreePropagator(src,ref,mass) ;
+    std::cout << " Free propagator norm "<< norm2(ref) <<std::endl;
+
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    LatticeFermionD    src5(FGrid); src5=Zero();
+    LatticeFermionD    tmp5(FGrid); 
+    LatticeFermionD    result5(FGrid); result5=Zero();
+    LatticeFermionD    result4(&GRID); 
+    const int sdir=0;
+
+    ////////////////////////////////////////////////////////////////////////
+    // Import
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Free propagator Import "<< norm2(src) <<std::endl;
+    Dov.ImportPhysicalFermionSource  (src,src5);
+    std::cout << " Free propagator Imported "<< norm2(src5) <<std::endl;
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Conjugate gradient on normal equations system
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
+    Dov.Mdag(src5,tmp5);
+    src5=tmp5;
+    MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
+    CG(HermOp,src5,result5);
+    ////////////////////////////////////////////////////////////////////////
+    // Domain wall physical field propagator
+    ////////////////////////////////////////////////////////////////////////
+    Dov.ExportPhysicalFermionSolution(result5,result4);
+
+    // From DWF4d.pdf :
+    //
+    // Dov_pf = 2/(1-m) D_cayley_ovlap  [ Page 43 ]
+    // Dinv_cayley_ovlap = 2/(1-m) Dinv_pf 
+    // Dinv_cayley_surface =1/(1-m) ( Dinv_cayley_ovlap - 1 ) =>  2/(1-m)^2 Dinv_pf - 1/(1-m) * src   [ Eq.2.67 ]
+
+    RealD scale = 2.0/(1.0-mass)/(1.0-mass);
+    result4 = result4 * scale;
+    result4 = result4 - src*(1.0/(1.0-mass)); // Subtract contact term
+    DumpSliceNorm("Src",src);
+    DumpSliceNorm("Grid",result4);
+    DumpSliceNorm("Fourier",ref);
+
+    std::cout << "Dov result4 "<<norm2(result4)<<std::endl;
+    std::cout << "Dov ref     "<<norm2(ref)<<std::endl;
+
+    diff = result4- ref;
+    DumpSliceNorm("diff ",diff);
+    
+  }
+
+  
+  Grid_finalize();
+}
--- a/tests/core/Test_lie_generators.cc
+++ b/tests/core/Test_lie_generators.cc
@@ -63,7 +63,9 @@ int main(int argc, char** argv) {
  std::cout << "Dimension of adjoint representation: "<< SU2Adjoint::Dimension << std::endl;

  // guard as this code fails to compile for Nc != 3
-#if (Nc == 3)
+#if 1
+
+  std::cout << " Printing  Adjoint Generators"<< std::endl;
    
  SU2Adjoint::printGenerators();
  SU2::testGenerators();
@@ -148,10 +150,33 @@ int main(int argc, char** argv) {
    typename AdjointRep<Nc>::LatticeMatrix Vrmu = peekLorentz(Vr,mu);
    pokeLorentz(UrVr,Urmu*Vrmu, mu);
  }
-    
+
+  typedef typename SU_Adjoint<Nc>::AMatrix AdjointMatrix;
  typename AdjointRep<Nc>::LatticeField Diff_check = UVr - UrVr;
  std::cout << GridLogMessage << "Group structure SU("<<Nc<<") check difference (Adjoint representation) : " << norm2(Diff_check) << std::endl;
-    
+
+  std::cout << GridLogMessage << "****************************************** " << std::endl;
+  std::cout << GridLogMessage << " MAP BETWEEN FUNDAMENTAL AND ADJOINT CHECK " << std::endl;
+  std::cout << GridLogMessage << "****************************************** " << std::endl;
+  for(int a=0;a<Nc*Nc-1;a++){
+  for(int b=0;b<Nc*Nc-1;b++){
+  for(int c=0;c<Nc*Nc-1;c++){
+    ColourMatrix Ta;
+    ColourMatrix Tb;
+    ColourMatrix Tc;
+    SU3::generator(a, Ta);
+    SU3::generator(b, Tb);
+    SU3::generator(c, Tc);
+    AdjointMatrix TRa;
+    SU3Adjoint::generator(a,TRa);
+    Complex tr1 = trace ( Tc * ( Ta*Tb-Tb*Ta)); // i/2 fabc
+    Complex tr2 = TRa()()(b,c) * Complex(0,1);
+    std::cout << " 2 Tr( Tc[Ta,Tb]) " << 2.0*tr1<<std::endl;
+    std::cout << " - TRa_bc " << tr2<<std::endl;
+    assert(abs( (2.0*tr1-tr2) ) < 1.0e-7);
+    std::cout << "------------------"<<std::endl;
+  }}}
+  
  // Check correspondence of algebra and group transformations
  // Create a random vector
  SU3::LatticeAlgebraVector h_adj(grid);
--- a/tests/debug/Test_padded_cell.cc
+++ b/tests/debug/Test_padded_cell.cc
@@ -0,0 +1,184 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_padded_cell.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class vobj> void gpermute(vobj & inout,int perm){
+  // For each of the low four bits of perm that is set, call permute() at that bit's level.
+  vobj scratch=inout; // permute() reads from this copy and writes into inout
+  for(int level=0;level<4;level++){
+    if (perm & (0x1<<level) ) { permute(inout,scratch,level); scratch=inout;}
+  }
+}
+  
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // Cross-check of the average plaquette: WilsonLoops reference versus a GeneralLocalStencil sum over a PaddedCell field.
+  Coordinate latt_size  = GridDefaultLatt();
+  Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
+  std::cout << " mpi "<<mpi_layout<<std::endl;
+  std::cout << " simd "<<simd_layout<<std::endl;
+  std::cout << " latt "<<latt_size<<std::endl;
+  GridCartesian GRID(latt_size,simd_layout,mpi_layout);
+
+  GridParallelRNG   pRNG(&GRID);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  LatticeGaugeField Umu(&GRID);
+
+  SU<Nc>::HotConfiguration(pRNG,Umu);
+
+  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
+  // plaq is the reference value that the padded-cell result is compared against at the end.
+
+  // Note: an uninitialised scratch lattice (only ever opened as an unread CpuRead view)
+  // and a vector of per-direction link copies used to be created here. Neither was
+  // read anywhere below, so both were removed; every link access in the stencil loop
+  // goes through a view of the padded field Ughost.
+
+  std::cout << GridLogMessage << " Average plaquette "<<plaq<<std::endl;
+
+  LatticeComplex cplaq(&GRID); cplaq=Zero();
+
+  /////////////////////////////////////////////////
+  // Create a padded cell of extra padding depth=1
+  /////////////////////////////////////////////////
+  int depth = 1;
+  PaddedCell Ghost(depth,&GRID);
+  LatticeGaugeField Ughost = Ghost.Exchange(Umu);
+
+  ///////////////////////////////////////////////////////////////////
+  // Temporary debug Hack for single rank sim:
+  // Check the contents of the cell are periodically replicated
+  // In future ONLY pad those dimensions that are not local to node
+  ///////////////////////////////////////////////////////////////////
+#if 0
+  {
+    double diff=0;
+    double n=0;
+  {
+    autoView( Ug_v , Ughost, CpuRead);
+    autoView( Ul_v , Umu   , CpuRead);
+  for(int x=0;x<latt_size[0]+2;x++){
+  for(int y=0;y<latt_size[1]+2;y++){
+  for(int z=0;z<latt_size[2]+2;z++){
+  for(int t=0;t<latt_size[3]+2;t++){
+    int lx=(x-1+latt_size[0])%latt_size[0];
+    int ly=(y-1+latt_size[1])%latt_size[1];
+    int lz=(z-1+latt_size[2])%latt_size[2];
+    int lt=(t-1+latt_size[3])%latt_size[3];
+    Coordinate gcoor({x,y,z,t});
+    Coordinate lcoor({lx,ly,lz,lt});
+    LorentzColourMatrix g;
+    LorentzColourMatrix l;
+    peekLocalSite(g,Ug_v,gcoor);
+    peekLocalSite(l,Ul_v,lcoor);
+    g=g-l;
+    assert(norm2(g)==0);
+    diff = diff + norm2(g);
+    n = n + norm2(l);
+  }}}}
+  }
+  std::cout << "padded field check diff "<< diff <<" / "<< n<<std::endl;
+  std::cout << norm2(Ughost)<< " " << norm2(Umu)<<std::endl;
+  }
+#endif
+
+  ///// Array for the site plaquette
+  GridBase *GhostGrid = Ughost.Grid();
+  LatticeComplex gplaq(GhostGrid); 
+  // Four stencil points are registered per (mu,nu) plane, in the order 0, +mu, +nu, 0.
+  std::vector<Coordinate> shifts;
+  for(int mu=0;mu<Nd;mu++){
+    for(int nu=mu+1;nu<Nd;nu++){
+  
+      //    Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
+      Coordinate shift_0(Nd,0);
+      Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
+      Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
+      shifts.push_back(shift_0);
+      shifts.push_back(shift_mu);
+      shifts.push_back(shift_nu);
+      shifts.push_back(shift_0);
+    }
+  }
+  GeneralLocalStencil gStencil(GhostGrid,shifts);
+
+  gplaq=Zero();
+  {
+    autoView( gp_v , gplaq, CpuWrite);
+    // s walks the stencil points: it advances by 4 per (mu,nu) plane, matching the push_back order above.
+    autoView( U_v , Ughost, CpuRead);
+    for(int ss=0;ss<gp_v.size();ss++){
+      int s=0;
+      for(int mu=0;mu<Nd;mu++){
+        for(int nu=mu+1;nu<Nd;nu++){
+          // Stencil entries for the four links of the (mu,nu) plaquette at site ss.
+          auto SE0 = gStencil.GetEntry(s+0,ss);
+          auto SE1 = gStencil.GetEntry(s+1,ss);
+          auto SE2 = gStencil.GetEntry(s+2,ss);
+          auto SE3 = gStencil.GetEntry(s+3,ss);
+          // Each entry's _offset is used as the index into the padded-field view.
+          int o0 = SE0->_offset;
+          int o1 = SE1->_offset;
+          int o2 = SE2->_offset;
+          int o3 = SE3->_offset;
+
+          auto U0 = U_v[o0](mu);
+          auto U1 = U_v[o1](nu);
+          auto U2 = adj(U_v[o2](mu));
+          auto U3 = adj(U_v[o3](nu));
+          // Apply the permutation recorded in each stencil entry before multiplying.
+          gpermute(U0,SE0->_permute);
+          gpermute(U1,SE1->_permute);
+          gpermute(U2,SE2->_permute);
+          gpermute(U3,SE3->_permute);
+
+          gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 );
+          s=s+4;
+        }
+      }
+    }
+  }
+  cplaq = Ghost.Extract(gplaq);
+  RealD vol = cplaq.Grid()->gSites();
+  RealD faces = (Nd * (Nd-1))/2;
+  auto p = TensorRemove(sum(cplaq));
+  auto result = p.real()/vol/faces/Nc;
+
+  std::cout << GridLogMessage << " Average plaquette via padded cell "<<result<<std::endl;
+  std::cout << GridLogMessage << " Diff "<<result-plaq<<std::endl;
+  
+  assert(fabs(result-plaq)<1.0e-8);
+  Grid_finalize();
+}
--- a/tests/forces/Test_double_ratio.cc
+++ b/tests/forces/Test_double_ratio.cc
@@ -476,7 +476,9 @@ int main (int argc, char ** argv)
  //  ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);

  //////////////////// One flavour boundary det  ////////////////////
+  /*
  RationalActionParams OFRp; // Up/down
+  int SP_iters = 3000;
  OFRp.lo       = 6.0e-5;
  OFRp.hi       = 90.0;
  OFRp.inv_pow  = 2;
@@ -489,7 +491,7 @@ int main (int argc, char ** argv)
  //  OFRp.degree   = 16;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
-  /*
+  */
  OneFlavourRationalParams OFRp; // Up/down
  OFRp.lo       = 4.0e-5;
  OFRp.hi       = 90.0;
@@ -499,7 +501,6 @@ int main (int argc, char ** argv)
  OFRp.degree   = 18;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
-  */
  std::vector<RealD> ActionTolByPole({
      1.0e-7,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
--- a/tests/forces/Test_fthmc.cc
+++ b/tests/forces/Test_fthmc.cc
@@ -0,0 +1,219 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/forces/Test_fthmc.cc
+
+    Copyright (C) 2022
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
+#include <Grid/qcd/smearing/JacobianAction.h>
+
+using namespace std;
+using namespace Grid;
+
+typedef MobiusFermionD FermionAction;
+typedef WilsonImplD FimplD;
+typedef WilsonImplD FermionImplPolicy;
+
+template<class Gimpl>
+void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeField> & smU,MomentumFilterBase<LatticeGaugeField> &Filter)
+{
+  LatticeGaugeField U = smU.get_U(false); // unsmeared config
+  GridBase *UGrid = U.Grid();
+  // Force test: compares S2-S1, measured across two eps steps along P, with the change predicted from action.deriv().
+  std::vector<int> seeds({1,2,3,5});
+  GridSerialRNG            sRNG;         sRNG.SeedFixedIntegers(seeds);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds);
+  // Filter is currently not applied: both applyFilter calls below are commented out.
+  LatticeColourMatrix Pmu(UGrid); 
+  LatticeGaugeField P(UGrid); 
+  LatticeGaugeField UdSdU(UGrid); 
+
+  std::cout << GridLogMessage << "*********************************************************"<<std::endl;
+  std::cout << GridLogMessage << " Force test for "<<action.action_name()<<std::endl;
+  std::cout << GridLogMessage << "*********************************************************"<<std::endl;
+  // eps is the step passed to each of the two update_field calls below.
+  RealD eps=0.01;
+
+  std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  std::cout << GridLogMessage << " Refresh "<<action.action_name()<<std::endl;
+  std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  // Draw the momenta P, then refresh the action on smU.
+  Gimpl::generate_momenta(P,sRNG,RNG4);
+  //  Filter.applyFilter(P);
+
+  action.refresh(smU,sRNG,RNG4);
+
+  std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
+  std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  // S1: action evaluated before any field update.
+  RealD S1 = action.S(smU);
+  // First eps step along P; the updated U is pushed back into smU.
+  Gimpl::update_field(P,U,eps);
+  smU.set_Field(U);
+
+  std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  std::cout << GridLogMessage << " Derivative "<<action.action_name()<<std::endl;
+  std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  action.deriv(smU,UdSdU); // derivative is taken after the first of the two eps steps
+  UdSdU = Ta(UdSdU);
+  //  Filter.applyFilter(UdSdU);
+
+  DumpSliceNorm("Force",UdSdU,Nd-1);
+  // Second eps step along P, again pushed back into smU.
+  Gimpl::update_field(P,U,eps);
+  smU.set_Field(U);
+
+  std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
+  std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  // S2: action evaluated after both eps steps.
+  RealD S2 = action.S(smU);
+
+  // Use the derivative: dS accumulates -trace(Pmu*UdSdUmu)*eps*2.0*HMC_MOMENTUM_DENOMINATOR over mu, then is summed over the lattice
+  LatticeComplex dS(UGrid); dS = Zero();
+  for(int mu=0;mu<Nd;mu++){
+    auto UdSdUmu = PeekIndex<LorentzIndex>(UdSdU,mu);
+    Pmu= PeekIndex<LorentzIndex>(P,mu);
+    dS = dS - trace(Pmu*UdSdUmu)*eps*2.0*HMC_MOMENTUM_DENOMINATOR;
+  }
+  ComplexD dSpred    = sum(dS);
+  RealD diff =  S2-S1-dSpred.real(); // measured change minus predicted change
+
+  std::cout<< GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
+  std::cout<< GridLogMessage << "S1 : "<< S1    <<std::endl;
+  std::cout<< GridLogMessage << "S2 : "<< S2    <<std::endl;
+  std::cout<< GridLogMessage << "dS : "<< S2-S1 <<std::endl;
+  std::cout<< GridLogMessage << "dSpred : "<< dSpred.real() <<std::endl;
+  std::cout<< GridLogMessage << "diff : "<< diff<<std::endl;
+  std::cout<< GridLogMessage << "*********************************************************"<<std::endl;
+  //  assert(diff<1.0);
+  std::cout<< GridLogMessage << "Done" <<std::endl;
+  std::cout << GridLogMessage << "*********************************************************"<<std::endl;
+}
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // Runs ForceTest for the gauge, Jacobian and two-flavour ratio actions on SmartConfig, then for the gauge and two-flavour actions on StoutConfig.
+  std::cout << std::setprecision(14);
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate mpi_layout  = GridDefaultMpi();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  Coordinate shm;
+  GlobalSharedMemory::GetShmDims(mpi_layout,shm);
+
+  const int Ls=12;
+  // The 4d grid reuses the layouts computed above rather than recomputing them inline.
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  
+  ///////////////////// Gauge Field and Gauge Forces ////////////////////////////
+  LatticeGaugeField U(UGrid);
+
+#if  0
+  FieldMetaData header;
+  std::string file("./ckpoint_lat.2000");
+  NerscIO::readConfiguration(U,header,file);
+#else
+  std::vector<int> seeds({1,2,3,4,5,6,7,8});
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds);
+  SU<Nc>::HotConfiguration(RNG4,U);
+#endif
+
+  
+  WilsonGaugeActionR  PlaqAction(6.0);
+  IwasakiGaugeActionR RectAction(2.13);
+  PlaqAction.is_smeared = true;  
+  RectAction.is_smeared = true;  
+
+  ////////////////////////////////////
+  // Fermion Action
+  ////////////////////////////////////
+  RealD mass=0.01; 
+  RealD pvmass=1.0; 
+  RealD M5=1.8; 
+  RealD b=1.5;
+  RealD c=0.5;
+  
+  // Double versions
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+  FermionAction DdwfPeriodic(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,Params);
+  FermionAction PVPeriodic  (U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pvmass,M5,b,c,Params);
+
+  double StoppingCondition = 1.0e-8;
+  int    MaxCGIterations = 50000; // an iteration count, so held as an integer rather than a double
+  ConjugateGradient<LatticeFermion>  CG(StoppingCondition,MaxCGIterations);
+
+  TwoFlavourRatioPseudoFermionAction<FimplD> Nf2(PVPeriodic, DdwfPeriodic,CG,CG);
+  Nf2.is_smeared = true;  
+  
+  ////////////////////////////////////////////////
+  // Plaquette only FTHMC smearer
+  ////////////////////////////////////////////////
+  double rho = 0.1;
+  Smear_Stout<PeriodicGimplR> Smearer(rho);
+  SmearedConfigurationMasked<PeriodicGimplR> SmartConfig(UGrid,2*Nd,Smearer);
+  SmearedConfiguration<PeriodicGimplR> StoutConfig(UGrid,1,Smearer);
+  // The Jacobian action is bound to SmartConfig and is only tested against it below.
+  JacobianAction<PeriodicGimplR> Jacobian(&SmartConfig);
+  
+  ////////////////////////////////////////////////
+  // Run some tests
+  ////////////////////////////////////////////////
+  MomentumFilterNone<LatticeGaugeField> FilterNone;
+
+  std::cout << " *********  FIELD TRANSFORM SMEARING ***** "<<std::endl;
+  // The field is reset to U before every test because ForceTest writes an updated field back via set_Field.
+  SmartConfig.set_Field(U);
+  ForceTest<GimplTypesR>(PlaqAction,SmartConfig,FilterNone);
+
+  SmartConfig.set_Field(U);
+  ForceTest<GimplTypesR>(RectAction,SmartConfig,FilterNone);
+
+  SmartConfig.set_Field(U);
+  ForceTest<GimplTypesR>(Jacobian,SmartConfig,FilterNone);
+
+  SmartConfig.set_Field(U);
+  ForceTest<GimplTypesR>(Nf2,SmartConfig,FilterNone);
+
+  std::cout << " *********    STOUT SMEARING ***** "<<std::endl;
+
+  StoutConfig.set_Field(U);
+  ForceTest<GimplTypesR>(PlaqAction,StoutConfig,FilterNone);
+
+  StoutConfig.set_Field(U);
+  ForceTest<GimplTypesR>(RectAction,StoutConfig,FilterNone);
+  
+  StoutConfig.set_Field(U);
+  ForceTest<GimplTypesR>(Nf2,StoutConfig,FilterNone);
+  
+
+  Grid_finalize();
+}
--- a/tests/qdpxx/Test_qdpxx_munprec.cc
+++ b/tests/qdpxx/Test_qdpxx_munprec.cc
@@ -1,7 +1,6 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
-
    Source file: ./tests/qdpxx/Test_qdpxx_munprec.cc

    Copyright (C) 2015
@@ -26,13 +25,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+#include <chroma.h>
+#include <actions/ferm/invert/syssolver_linop_cg_array.h>
+#include <actions/ferm/invert/syssolver_linop_aggregate.h>
+
 #include <Grid/Grid.h>

 int    Ls=8;
 double M5=1.6;
 double mq=0.01;
-double zolo_lo = 0.1;
-double zolo_hi = 2.0;
+double zolo_lo = 0.01;
+double zolo_hi = 7.0;
 double mobius_scale=2.0;

 enum ChromaAction {
@@ -55,11 +58,6 @@ enum ChromaAction {
 void calc_grid      (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag);
 void calc_chroma    (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag);

-#include <chroma.h>
-#include <actions/ferm/invert/syssolver_linop_cg_array.h>
-#include <actions/ferm/invert/syssolver_linop_aggregate.h>
-
-

 namespace Chroma { 

@@ -81,7 +79,7 @@ public:

    std::vector<int> x(4);
    QDP::multi1d<int> cx(4);
-    std::vector<int> gd= gr.Grid()->GlobalDimensions();
+    Grid::Coordinate gd = gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -124,7 +122,7 @@ public:

    std::vector<int> x(5);
    QDP::multi1d<int> cx(4);
-    std::vector<int> gd= gr.Grid()->GlobalDimensions();
+    Grid::Coordinate gd= gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -166,7 +164,7 @@ public:

    std::vector<int> x(5);
    QDP::multi1d<int> cx(4);
-    std::vector<int> gd= gr.Grid()->GlobalDimensions();
+    Grid::Coordinate gd= gr.Grid()->GlobalDimensions();

    for (x[0]=0;x[0]<gd[0];x[0]++){
    for (x[1]=0;x[1]<gd[1];x[1]++){
@@ -304,7 +302,30 @@ public:
     //     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
     //     param.approximation_type=COEFF_TYPE_TANH;
     param.tuning_strategy_xml=
-"<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name></TuningStrategy>\n";
+"<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name><TuningConstant>1.0</TuningConstant></TuningStrategy>\n";
+     UnprecOvExtFermActArray S_f(cfs,param);
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
+     return M;
+   }
+   if ( parms == HwPartFracTanh ) {
+     if ( Ls%2 == 0 ) { 
+       printf("Ls is not odd\n");
+       exit(-1);
+     }
+     UnprecOvExtFermActArrayParams param;
+     param.OverMass=M5; 
+     param.Mass=_mq;
+     param.RatPolyDeg = Ls;
+     param.ApproxMin =eps_lo;
+     param.ApproxMax =eps_hi;
+     param.b5 =1.0;
+     param.c5 =1.0;
+     //     param.approximation_type=COEFF_TYPE_ZOLOTAREV;
+     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
+     //param.approximation_type=COEFF_TYPE_TANH;
+     param.tuning_strategy_xml=
+       "<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name><TuningConstant>1.0</TuningConstant></TuningStrategy>\n";
     UnprecOvExtFermActArray S_f(cfs,param);
     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
@@ -316,7 +337,35 @@ public:
     param.ApproxMin=eps_lo;
     param.ApproxMax=eps_hi;
     param.approximation_type=COEFF_TYPE_ZOLOTAREV;
-     param.RatPolyDeg=Ls;
+     param.RatPolyDeg=Ls-1;
+     // The following is why I think Chroma made some directional errors:
+     param.AuxFermAct= std::string(
+"<AuxFermAct>\n"
+"  <FermAct>UNPRECONDITIONED_WILSON</FermAct>\n"
+"  <Mass>-1.8</Mass>\n"
+"  <b5>1</b5>\n"
+"  <c5>0</c5>\n"
+"  <MaxCG>1000</MaxCG>\n"
+"  <RsdCG>1.0e-9</RsdCG>\n"
+"  <FermionBC>\n"
+"      <FermBC>SIMPLE_FERMBC</FermBC>\n"
+"      <boundary>1 1 1 1</boundary>\n"
+"   </FermionBC> \n"
+"</AuxFermAct>"
+);
+     param.AuxFermActGrp= std::string("");
+     UnprecOvlapContFrac5DFermActArray S_f(fbc,param);
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
+     return  M;
+   }
+   if ( parms == HwContFracTanh ) {
+     UnprecOvlapContFrac5DFermActParams param;
+     param.Mass=_mq; // How is M5 set? Wilson mass In AuxFermAct
+     param.ApproxMin=eps_lo;
+     param.ApproxMax=eps_hi;
+     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
+     param.RatPolyDeg=Ls-1;
     // The following is why I think Chroma made some directional errors:
     param.AuxFermAct= std::string(
 "<AuxFermAct>\n"
@@ -378,7 +427,14 @@ int main (int argc,char **argv )
   * Setup QDP
   *********************************************************/
  Chroma::initialize(&argc,&argv);
-  Chroma::WilsonTypeFermActs4DEnv::registerAll(); 
+  //  Chroma::WilsonTypeFermActs4DEnv::registerAll(); 
+  Chroma::WilsonTypeFermActsEnv::registerAll(); 
+  //bool linkageHack(void)
+  //{
+  //  bool foo = true;
+  // Inline Measurements
+  //  InlineAggregateEnv::registerAll();
+  //  GaugeInitEnv::registerAll();

  /********************************************************
   * Setup Grid
@@ -388,26 +444,34 @@ int main (int argc,char **argv )
                                                                       Grid::GridDefaultSimd(Grid::Nd,Grid::vComplex::Nsimd()),
                                                                       Grid::GridDefaultMpi());
  
-  std::vector<int> gd = UGrid->GlobalDimensions();
+  Grid::Coordinate gd = UGrid->GlobalDimensions();
  QDP::multi1d<int> nrow(QDP::Nd);
  for(int mu=0;mu<4;mu++) nrow[mu] = gd[mu];

  QDP::Layout::setLattSize(nrow);
  QDP::Layout::create();

-  Grid::GridCartesian         * FGrid   = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  Grid::LatticeGaugeField lat(UGrid);
-  Grid::LatticeFermion    src(FGrid);
-  Grid::LatticeFermion    res_chroma(FGrid);
-  Grid::LatticeFermion    res_grid  (FGrid);
-  
  std::vector<ChromaAction> ActionList({
 		 HtCayleyTanh, // Plain old DWF.
 		 HmCayleyTanh,
 		 HwCayleyTanh,
 		 HtCayleyZolo, // Plain old DWF.
 		 HmCayleyZolo,
-		 HwCayleyZolo
+		 HwCayleyZolo,
+		 HwPartFracZolo,
+		 HwContFracZolo,
+		 HwContFracTanh
+  });
+  std::vector<int> LsList({
+      8,//HtCayleyTanh, // Plain old DWF.
+      8,//HmCayleyTanh,
+      8,//HwCayleyTanh,
+      8,//HtCayleyZolo, // Plain old DWF.
+      8,//HmCayleyZolo,
+      8,//HwCayleyZolo,
+      9,//HwPartFracZolo
+      9, //HwContFracZolo
+      9 //HwContFracTanh
  });
  std::vector<std::string> ActionName({
        "HtCayleyTanh",
@@ -415,10 +479,19 @@ int main (int argc,char **argv )
 	"HwCayleyTanh",
 	"HtCayleyZolo",
 	"HmCayleyZolo",
-        "HwCayleyZolo"
+        "HwCayleyZolo",
+	"HwPartFracZolo",
+	"HwContFracZolo",
+	"HwContFracTanh"
  });

  for(int i=0;i<ActionList.size();i++) {
+    Ls = LsList[i];
+    Grid::GridCartesian      * FGrid   = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+    Grid::LatticeGaugeField lat(UGrid);
+    Grid::LatticeFermion    src(FGrid);
+    Grid::LatticeFermion    res_chroma(FGrid);
+    Grid::LatticeFermion    res_grid  (FGrid);
    std::cout << "*****************************"<<std::endl;
    std::cout << "Action "<<ActionName[i]<<std::endl;
    std::cout << "*****************************"<<std::endl;
@@ -439,6 +512,7 @@ int main (int argc,char **argv )
      
      std::cout << "Norm of difference "<<Grid::norm2(res_chroma)<<std::endl;
    }
+    delete FGrid;
  }

  std::cout << "Finished test "<<std::endl;
@@ -502,7 +576,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF
  Grid::gaussian(RNG5,src);
  Grid::gaussian(RNG5,res);

-  Grid::SU<Nc>::HotConfiguration(RNG4,Umu);
+  Grid::SU<Grid::Nc>::HotConfiguration(RNG4,Umu);

  /*
  Grid::LatticeColourMatrix U(UGrid);
@@ -519,7 +593,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HtCayleyTanh ) { 

-    Grid::DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);
+    Grid::DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);

    std::cout << Grid::GridLogMessage <<" Calling domain wall multiply "<<std::endl;

@@ -535,7 +609,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

    Grid::Real _b = 0.5*(mobius_scale +1.0);
    Grid::Real _c = 0.5*(mobius_scale -1.0);
-    Grid::MobiusZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);
+    Grid::MobiusZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);

    std::cout << Grid::GridLogMessage <<" Calling mobius zolo multiply "<<std::endl;

@@ -549,7 +623,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HtCayleyZolo ) {

-    Grid::ShamirZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    Grid::ShamirZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);

    std::cout << Grid::GridLogMessage <<" Calling shamir zolo multiply "<<std::endl;

@@ -561,6 +635,60 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF
    return;
  }

+  if ( action == HwPartFracTanh ) {
+
+    Grid::OverlapWilsonPartialFractionTanhFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
+
+    std::cout << Grid::GridLogMessage <<" Calling part frac tanh multiply "<<std::endl;
+
+    if ( dag ) 
+      Dov.Mdag(src,res);  
+    else 
+      Dov.M(src,res);  
+
+    return;
+  }
+
+  if ( action == HwContFracTanh ) {
+
+    Grid::OverlapWilsonContFracTanhFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
+
+    std::cout << Grid::GridLogMessage <<" Calling cont frac tanh multiply "<<std::endl;
+
+    if ( dag ) 
+      Dov.Mdag(src,res);  
+    else 
+      Dov.M(src,res);  
+
+    return;
+  }
+  if ( action == HwContFracZolo ) {
+
+    Grid::OverlapWilsonContFracZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+
+    std::cout << Grid::GridLogMessage <<" Calling cont frac zolo multiply "<<std::endl;
+
+    if ( dag ) 
+      Dov.Mdag(src,res);  
+    else 
+      Dov.M(src,res);  
+
+    return;
+  }
+
+  if ( action == HwPartFracZolo ) {
+
+    Grid::OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    std::cout << Grid::GridLogMessage <<" Calling part frac zolotarev multiply "<<std::endl;
+
+    if ( dag ) 
+      Dov.Mdag(src,res);  
+    else 
+      Dov.M(src,res);  
+
+    return;
+  }
+  
  /*
  if ( action == HmCayleyTanh ) {
    Grid::Real _b = 0.5*(mobius_scale +1.0);
@@ -581,7 +709,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HmCayleyTanh ) {

-    Grid::ScaledShamirFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);
+    Grid::ScaledShamirFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);

    std::cout << Grid::GridLogMessage <<" Calling scaled shamir multiply "<<std::endl;

@@ -595,7 +723,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HwCayleyTanh ) {

-    Grid::OverlapWilsonCayleyTanhFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
+    Grid::OverlapWilsonCayleyTanhFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);

    if ( dag ) 
      D.Mdag(src,res);  
@@ -607,7 +735,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF

  if ( action == HwCayleyZolo ) {

-    Grid::OverlapWilsonCayleyZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    Grid::OverlapWilsonCayleyZolotarevFermionD D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);

    if ( dag ) 
      D.Mdag(src,res);  
--- a/tests/solver/Test_dwf_cg_prec.cc
+++ b/tests/solver/Test_dwf_cg_prec.cc
@@ -1,4 +1,4 @@
-/*************************************************************************************
+/*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid

@@ -67,7 +67,13 @@ int main(int argc, char** argv) {
  result = Zero();
  LatticeGaugeField Umu(UGrid);

+#if 0
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000");
+  NerscIO::readConfiguration(Umu,header,file);
+#else  
  SU<Nc>::HotConfiguration(RNG4, Umu);
+#endif

  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt()
            << "   Ls: " << Ls << std::endl;
--- a/tests/solver/Test_dwf_cg_unprec.cc
+++ b/tests/solver/Test_dwf_cg_unprec.cc
@@ -54,15 +54,30 @@ int main (int argc, char ** argv)
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

+  std::vector<ComplexD> qmu;
+  qmu.push_back(ComplexD(0.1,0.0));
+  qmu.push_back(ComplexD(0.0,0.0));
+  qmu.push_back(ComplexD(0.0,0.0));
+  qmu.push_back(ComplexD(0.0,0.01));
+  
+
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

+  LatticeFermion    tmp(FGrid);
  LatticeFermion    src(FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=Zero();
-  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
-
+  LatticeGaugeField Umu(UGrid); 
+#if 0
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000");
+  NerscIO::readConfiguration(Umu,header,file);
+#else  
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+#endif
+  
  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -71,8 +86,15 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5=1.8;
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  Ddwf.qmu = qmu;

+  Ddwf.M(src,tmp);
+  std::cout << " |M src|^2 "<<norm2(tmp)<<std::endl;
  MdagMLinearOperator<DomainWallFermionD,LatticeFermion> HermOp(Ddwf);
+  HermOp.HermOp(src,tmp);
+
+  std::cout << " <src|MdagM| src> "<<innerProduct(src,tmp)<<std::endl;
+  
  ConjugateGradient<LatticeFermion> CG(1.0e-6,10000);
  CG(HermOp,src,result);
Author	SHA1	Message	Date
Peter Boyle	6815e138b4	Boosted fermion attempt	2024-10-17 18:37:33 +01:00
Peter Boyle	e29b97b3ea	Qslash term added	2023-09-14 16:14:03 -04:00
Peter Boyle	ad2b699d2b	Better macos	2023-09-14 16:12:21 -04:00
Peter Boyle	b8a7004365	Partial fraction test	2023-08-14 15:17:03 -04:00
Peter Boyle	994512048e	Merge pull request #439 from felixerben/bugfix/IRL_convergence Bugfix/irl convergence	2023-07-12 16:32:26 -04:00
Felix Erben	78bae9417c	returning Nstop vectors even if not all meet true convergence criterion	2023-06-27 14:38:19 +01:00
Felix Erben	dd170ead01	whitespace	2023-06-27 11:37:01 +01:00
Felix Erben	014704856f	do one more iteration if not all vectors converged	2023-06-27 11:33:30 +01:00
Peter Boyle	ee92e08edb	Merge pull request #435 from fjosw/fix/warnings_in_WilsonKernelsImplementation Unused variable in WilsonKernelsImplementation	2023-06-23 11:47:19 -04:00
Peter Boyle	c1dcee9328	Merge pull request #437 from fjosw/fix/stencil_debug Added GridLogDebug to BuildSurfaceList debug message	2023-06-23 11:47:00 -04:00
Peter Boyle	6b150961fe	Better script	2023-06-23 18:09:25 +03:00
Peter Boyle	5bafcaedfa	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-06-22 19:59:45 +03:00
Peter Boyle	bfeceae708	FTHMC	2023-06-22 12:58:18 -04:00
Peter Boyle	eacb66591f	Config command	2023-06-22 19:56:40 +03:00
Peter Boyle	fadaa85626	Update	2023-06-22 19:56:27 +03:00
Peter Boyle	02a5b0d786	Updating run during testing	2023-06-22 19:52:46 +03:00
Peter Boyle	0e2141442a	Dennis says broken	2023-06-22 19:19:51 +03:00
Peter Boyle	769eb0eecb	Precision coverage	2023-06-22 19:19:20 +03:00
Fabian Joswig	85e35c4da1	fix: added GridLogDebug to BuildSurfaceList debug message.	2023-06-16 10:31:16 +01:00
Peter Boyle	d72e914cf0	Profiling temporary code until optimised	2023-06-15 10:43:04 -04:00
Peter Boyle	3b5254e2d5	Optional checkpoint smeared configs for FTHMC	2023-06-15 10:43:04 -04:00
Peter Boyle	f1c358b596	Additional tests	2023-06-15 10:43:04 -04:00
Peter Boyle	c0ef210265	Hot start should be properly Hot	2023-06-15 10:43:04 -04:00
Peter Boyle	e3e1cc1962	Ta project	2023-06-15 10:43:04 -04:00
Peter Boyle	723eadbb5c	Keep methods virtual	2023-06-15 10:43:04 -04:00
Peter Boyle	e24637ec1e	Clean up	2023-06-15 10:43:04 -04:00
Peter Boyle	8b01ff4ce7	Integrator over to smeared force structure	2023-06-15 10:43:04 -04:00
Peter Boyle	588197c487	Smeared action virtual class	2023-06-15 10:43:04 -04:00
Peter Boyle	1352bad2e4	Sunspot compile	2023-06-15 11:22:46 +00:00
Peter Boyle	ffd7301649	Updated masked / fthmc smeared config container	2023-06-01 06:23:02 -04:00
Peter Boyle	d2a8494044	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-06-01 06:22:33 -04:00
Peter Boyle	0982e0d19b	Jacobian action wrapper for FTHMC	2023-06-01 06:15:08 -04:00
Peter Boyle	3badbfc3c1	Refactor the Action and Smeared gauge configuration containers. Add first pass at FTHMC action	2023-06-01 06:14:28 -04:00
Peter Boyle	5465961e30	New test for FTHMC portion	2023-06-01 06:14:04 -04:00
Fabian Joswig	477b794bc5	fix: unused variable removed.	2023-05-29 14:08:53 +01:00
Peter Boyle	4835fd1a87	HIP stream synch	2023-05-27 17:58:22 +03:00
Peter Boyle	6533c25814	Lumi	2023-05-27 16:13:32 +03:00
Peter Boyle	1b2914ec09	FT-HMC smearing, derivative chain rule, log det and force first pass.	2023-05-22 10:21:37 -04:00
Peter Boyle	519f795066	Header not liked by gcc on mac? puzzling	2023-05-22 10:21:12 -04:00
Peter Boyle	4240ad5ca8	Preparing for FTHMC	2023-05-19 21:21:55 -04:00
Peter Boyle	d418347d86	public for convenience to see rho params	2023-05-19 21:21:05 -04:00
Peter Boyle	29a4bfe5e5	Clean up	2023-05-19 21:20:45 -04:00
Peter Boyle	9955bf9daf	Regresses to Qlat	2023-05-19 17:32:13 -04:00
Peter Boyle	876c8f4478	Nodes on padded cell	2023-05-11 12:35:49 -04:00
Peter Boyle	9c8750f261	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-05-11 12:29:09 -04:00
Peter Boyle	91efd08179	Option for Qlat generator basis	2023-05-11 12:27:45 -04:00
Peter Boyle	9953511b65	Mac compile	2023-05-11 12:27:29 -04:00
Peter Boyle	025fa9991a	For FTHMC	2023-05-11 12:26:14 -04:00
Peter Boyle	e8c60c355b	Padded cell code	2023-05-11 12:25:50 -04:00
Peter Boyle	6c9c7f9d85	Permute fix	2023-05-11 12:24:21 -04:00
Peter Boyle	f534523ede	Debug	2023-05-11 12:23:11 -04:00
Peter Boyle	1b8a834beb	Debug	2023-05-11 12:22:24 -04:00
Peter Boyle	3aa43e6065	Debug info	2023-04-20 14:21:13 -04:00
Peter Boyle	78ac4044ff	HMC	2023-04-20 13:28:07 -04:00
Peter Boyle	119c3db47f	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-04-18 15:13:16 -04:00
Peter Boyle	21bbdb8fc2	Crusher	2023-04-18 15:11:16 -04:00
Peter Boyle	739bd7572c	Example code	2023-04-17 21:51:55 +00:00
Peter Boyle	074627a5bd	Pass file descriptors through AF_UNIX for level_zero	2023-04-17 21:50:52 +00:00
Peter Boyle	6a23b2c599	Drop UVM	2023-04-17 21:49:58 +00:00
Peter Boyle	bd891fb3f5	tests to compile	2023-04-12 18:32:44 -04:00
Peter Boyle	3984265851	Merge pull request #432 from paboyle/hotfix/nvcc-warnings Unused statements generating warnings removed	2023-04-12 16:59:02 -04:00
Peter Boyle	45361d188f	Merge pull request #427 from fjosw/feat/bug_report_issue_template Feat/bug report issue template	2023-04-12 16:58:41 -04:00
Peter Boyle	80c9d77e02	Merge pull request #433 from paboyle/hotfix/virtual-dtor Virtual destructor for LinearOperator	2023-04-12 16:56:18 -04:00
Peter Boyle	3aff64dddb	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-04-11 12:19:15 -07:00
Peter Boyle	b4f2ca81ff	Copy queue and compute queue same as better concurrency	2023-04-11 12:18:21 -07:00
Peter Boyle	d1dea5f840	New driver	2023-04-11 12:16:52 -07:00
Peter Boyle	54f8b84d16	Fence	2023-04-11 12:16:08 -07:00
Peter Boyle	da503fef0e	Name change on barrier routine	2023-04-11 12:14:04 -07:00
Peter Boyle	4a6802098a	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-04-07 15:43:28 -04:00
Peter Boyle	f9b41a84d2	Trajectory runs to completion on Crusher within wall clock time	2023-04-07 15:42:45 -04:00
Antonin Portelli	9e64387933	mores unused statements removed	2023-04-07 14:27:18 +01:00
Antonin Portelli	983b681d46	unused statement cleaning	2023-04-07 14:12:02 +01:00
Peter Boyle	86dac5ff4f	Better printing	2023-04-04 07:42:19 -07:00
Peter Boyle	4a382fad3f	Use distinct SYCL queue for copies	2023-04-04 07:41:41 -07:00
Peter Boyle	cc753670d9	Barrier elimination, surface list build	2023-04-04 07:39:14 -07:00
Peter Boyle	cc9d88ea1c	Fence changes and EXT kernel loop cout reduction	2023-04-04 07:37:23 -07:00
Peter Boyle	b281b0166e	Put the barrier in the subroutine	2023-04-04 07:36:03 -07:00
Peter Boyle	6a21f694ff	Apply barrier in Gather kernel sequence. Could place before comms, or in Gather, but decided to insist Gather means Gather is done	2023-04-04 07:33:24 -07:00
Fabian Joswig	39214702f6	feat: indentation fixed.	2023-03-28 16:30:34 +02:00
Fabian Joswig	3e4614c63a	feat: draft for bug-report issue template added.	2023-03-28 16:24:35 +02:00
Peter Boyle	ccd21f96ff	Plaquette agreeing and moving to final form (slowly) need to optimise	2023-02-01 22:57:44 -05:00
Peter Boyle	4b90cb8888	First cut passes combining padded cell with general stencil towards fast plaquette and staggered force	2023-02-01 22:14:10 -05:00