mirror of https://github.com/paboyle/Grid.git synced 2025-11-09 16:19:32 +00:00

Compare commits


27 Commits

Author SHA1 Message Date
73af020f98 improved 2025-06-27 06:08:54 +00:00
bffb83c46e std::cout<<GridLogMessage<<"Debug:"<<std::endl;
std::cout<<GridLogMessage<<"  --dylib-map     : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
    std::cout<<GridLogMessage<<"  --heartbeat     : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
    std::cout<<GridLogMessage<<"  --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-heartbeat : periodically report backtrace "<<std::endl;

--dylib-map : Grid prints its dylib regions
--heartbeat : itimer based / SIGALRM wake up which seems to make Aurora
more stable
--debug-heartbeat : periodically report to stderr where we are in code

Now have libunwind option (configure: --with-unwind=<prefix>) to give an
Asynch-Signal safe backtrace. Avoid glibc backtrace due to mallocs.
2025-06-27 06:08:54 +00:00
7031f37350 Use libunwind for backtrace as it is signal asynch safe 2025-06-27 06:08:54 +00:00
829dd74cb2 Verbose change 2025-06-27 06:08:54 +00:00
66e671985d P2P 2025-06-27 06:08:54 +00:00
5afcbcf0f3 Cshift uses flight recorder 2025-06-27 06:08:54 +00:00
9730579312 Simplify and verbose 2025-06-27 06:08:51 +00:00
bfae14d035 More flight logging 2025-06-27 06:07:34 +00:00
b78fc73d19 Better signal handler 2025-06-27 06:07:34 +00:00
Peter Boyle
709f8ae76c Update README 2025-06-26 23:06:11 -04:00
Peter Boyle
7aa06329d0 Update for new stencil compression options 2025-06-17 18:06:19 +02:00
Peter Boyle
9d6a38c44c Compressed comms options as Sloppy 2025-06-17 16:43:53 +02:00
Peter Boyle
6ec5cee368 Preparing for compressed comms 2025-06-17 16:38:10 +02:00
Peter Boyle
f2e9a68825 Simplify 2025-06-13 17:32:05 +02:00
Peter Boyle
d88750e6b6 Sloppy + non-sloppy 2025-06-13 16:42:01 +02:00
Peter Boyle
821358eda7 Remove partial dirichlet. Favour intro reduced prec comms options 2025-06-13 05:08:45 +02:00
Peter Boyle
fce6e1f135 Kill core files for quota reasons 2025-06-13 05:08:15 +02:00
Peter Boyle
8f0bb3e676 remove partial dirichlet 2025-06-13 05:07:56 +02:00
Peter Boyle
262c70d967 USe sloppy comms options 2025-06-13 05:07:23 +02:00
Peter Boyle
da43ef7c2d REmove partial dirichlet option. It's going nowhere 2025-06-13 05:05:15 +02:00
Peter Boyle
7b60ab5df1 Warning suppress 2025-06-13 05:04:55 +02:00
Peter Boyle
f6b961a64e Warning suppress 2025-06-13 05:04:47 +02:00
Peter Boyle
f1ed988aa3 Interface to reduced precision comms 2025-06-13 05:04:12 +02:00
Peter Boyle
eea51bb604 Suppress annoying warns 2025-06-13 05:03:36 +02:00
Peter Boyle
9203126aa5 Scripts 2025-06-11 15:30:16 +02:00
Peter Boyle
f90ba4712a Update for Jupiter 2025-06-11 15:24:34 +02:00
Peter Boyle
3737a24096 Updated python output 2025-06-03 14:09:29 -04:00
79 changed files with 1753 additions and 6901 deletions
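Note: the debug options documented in commit bffb83c46e above are ordinary Grid command-line flags, while --with-unwind=<prefix> is a configure-time switch. A minimal, hedged sketch of how the runtime flags would typically be passed (the binary name and flag combination below are illustrative only; the flag names are taken from the help text in that commit):

#include <Grid/Grid.h>
int main(int argc, char **argv)
{
  // e.g. run as: ./Benchmark_dwf --debug-signals --heartbeat --dylib-map --signal-delay 30
  Grid::Grid_init(&argc, &argv);   // parses and strips the Grid debug flags
  // ... application ...
  Grid::Grid_finalize();
  return 0;
}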

View File

@@ -51,11 +51,13 @@ directory
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
+#pragma nv_diag_suppress declared_but_not_referenced
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
+#pragma diag_suppress declared_but_not_referenced
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon

View File

@@ -37,7 +37,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
-#include <Grid/qcd/spin/Pauli.h> // depends on Gparity
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);

View File

@@ -31,6 +31,5 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cartesian/Cartesian_base.h>
 #include <Grid/cartesian/Cartesian_full.h>
 #include <Grid/cartesian/Cartesian_red_black.h>
-#include <Grid/cartesian/CartesianCrossIcosahedron.h>
 #endif

View File

@@ -1,241 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/CartesianCrossIcosahedron.h
Copyright (C) 2025
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
enum IcosahedralMeshType {
IcosahedralVertices,
IcosahedralEdges
} ;
enum NorthSouth {
North = 1,
South = 0
};
enum IcoshedralDirections {
IcosahedronPatchX = 0,
IcosahedronPatchY = 1,
IcosahedronPatchDiagonal=2,
NumIcosahedralPolarizations
};
const int IcosahedralPatches = 10;
const int HemiPatches=IcosahedralPatches/2;
const int NorthernHemisphere = HemiPatches;
const int SouthernHemisphere = 0;
class GridCartesianCrossIcosahedron: public GridCartesian {
public:
IcosahedralMeshType meshType;
IcosahedralMeshType MeshType(void) { return meshType; };
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
/*
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{
assert(0); // No subdivision
}
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{
assert(0); // No subdivision
}
*/
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesianCrossIcosahedron(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
IcosahedralMeshType _meshType) : GridCartesian(dimensions,simd_layout,processor_grid)
{
meshType = _meshType;
Coordinate S2dimensions=dimensions;
Coordinate S2simd =simd_layout;
Coordinate S2procs =processor_grid;
assert(simd_layout[0]==1); // Force simd into perpendicular dimensions
assert(simd_layout[1]==1); // to avoid pole storage complexity interacting with SIMD.
assert(dimensions[_ndimension-1]==IcosahedralPatches);
assert(processor_grid[_ndimension-1]<=2); // Keeps the patches that need a pole on the same node
// Save a copy of the basic cartesian initialisation volume
cartesianOsites = this->_osites;
// allocate the pole storage if we are seeking vertex domain data
if ( meshType == IcosahedralVertices ) {
InitPoles();
}
}
virtual ~GridCartesianCrossIcosahedron() = default;
////////////////////////////////////////////////
// Use to decide if a given grid is icosahedral
////////////////////////////////////////////////
int hasNorthPole;
int hasSouthPole;
int northPoleOsite;
int southPoleOsite;
int northPoleOsites;
int southPoleOsites;
int cartesianOsites;
virtual int isIcosahedral(void) override { return 1;}
virtual int isIcosahedralVertex(void) override { return meshType==IcosahedralVertices;}
virtual int isIcosahedralEdge (void) override { return meshType==IcosahedralEdges;}
virtual int NorthPoleOsite(void) const override { return northPoleOsite; };
virtual int NorthPoleOsites(void) const override { return northPoleOsites; };
virtual int SouthPoleOsite(void) const override { return southPoleOsite; };
virtual int SouthPoleOsites(void) const override { return southPoleOsites; };
virtual int ownsNorthPole(void) const override { return hasNorthPole; };
virtual int ownsSouthPole(void) const override { return hasSouthPole; };
virtual int CartesianOsites(void) const override { return cartesianOsites; };
virtual int64_t PoleIdxForOcoor(Coordinate &Coor) override
{
// Work out the pole_osite. Pick the higher dims
Coordinate rdims;
Coordinate ocoor;
int64_t pole_idx;
int Ndm1 = this->Nd()-1;
for(int d=2;d<Ndm1;d++){
int dd=d-2;
rdims.push_back(this->_rdimensions[d]);
ocoor.push_back(Coor[d]%this->_rdimensions[d]);
}
Lexicographic::IndexFromCoor(ocoor,pole_idx,rdims);
return pole_idx;
}
virtual int64_t PoleSiteForOcoor(Coordinate &Coor) override
{
int Ndm1 = this->Nd()-1;
int64_t pole_idx = this->PoleIdxForOcoor(Coor);
int64_t pole_osite;
if ( Coor[Ndm1] >= HemiPatches ) {
pole_osite = pole_idx + this->NorthPoleOsite();
} else {
pole_osite = pole_idx + this->SouthPoleOsite();
}
return pole_osite;
}
void InitPoles(void)
{
int Ndm1 = _ndimension-1;
///////////////////////
// Add the extra pole storage
///////////////////////
// Vertices = 1x LxLx D1...Dn + 2.D1...Dn
// Start after the LxL and don't include the 10 patch dim
int OrthogSize = 1;
for (int d = 2; d < Ndm1; d++) {
OrthogSize *= _gdimensions[d];
}
_fsites += OrthogSize*2;
_gsites += OrthogSize*2;
// Simd reduced sizes are multiplied up.
// If the leading LxL are simd-ized, the vector objects will contain "redundant" lanes
// which should contain identical north (south) pole data
OrthogSize = 1;
for (int d = 2; d < Ndm1; d++) {
OrthogSize *= _rdimensions[d];
}
// Grow the local volume to hold pole data
// on rank (0,0) in the LxL planes
// since SIMD must be placed in the orthogonal directions
Coordinate pcoor = this->ThisProcessorCoor();
Coordinate pgrid = this->ProcessorGrid();
const int xdim=0;
const int ydim=1;
/*
*
* /\/\/\/\/\
* /\/\/\/\/\/
* \/\/\/\/\/
*
* y
* /
* \x
*
* Labelling patches as 5 6 7 8 9
* 0 1 2 3 4
*
* Will ban distribution of the patch dimension by more than 2.
*
* Hence all 5 patches associated with the pole must have the
* appropriate "corner" of the patch L^2 located on the SAME rank.
*/
if( (pcoor[xdim]==pgrid[xdim]-1) && (pcoor[ydim]==0) && (pcoor[Ndm1]==0) ){
hasSouthPole =1;
southPoleOsite=this->_osites;
southPoleOsites=OrthogSize;
this->_osites += OrthogSize;
} else {
hasSouthPole =0;
southPoleOsites=0;
southPoleOsite=0;
}
if( (pcoor[xdim]==0) && (pcoor[ydim]==pgrid[ydim]-1) && (pcoor[Ndm1]==pgrid[Ndm1]-1) ){
hasNorthPole =1;
northPoleOsite=this->_osites;
northPoleOsites=OrthogSize;
this->_osites += OrthogSize;
} else {
hasNorthPole =0;
northPoleOsites=0;
northPoleOsite=0;
}
std::cout << GridLogDebug<<"Icosahedral vertex field volume " << this->_osites<<std::endl;
std::cout << GridLogDebug<<"Icosahedral south pole offset " << this->southPoleOsite<<std::endl;
std::cout << GridLogDebug<<"Icosahedral north pole offset " << this->northPoleOsite<<std::endl;
std::cout << GridLogDebug<<"Icosahedral south pole size " << this->southPoleOsites<<std::endl;
std::cout << GridLogDebug<<"Icosahedral north pole size " << this->northPoleOsites<<std::endl;
};
};
NAMESPACE_END(Grid);

View File

@@ -86,25 +86,10 @@ public:
 public:
-// Icosahedral decisions
-virtual int isIcosahedral(void) { return 0;}
-virtual int isIcosahedralVertex(void) { return 0;}
-virtual int isIcosahedralEdge (void) { return 0;}
-virtual int ownsNorthPole(void) const { return 0; };
-virtual int ownsSouthPole(void) const { return 0; };
-virtual int NorthPoleOsite(void) const { return 0; };
-virtual int SouthPoleOsite(void) const { return 0; };
-virtual int NorthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
-virtual int SouthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
-virtual int CartesianOsites(void) const { return this->oSites(); };
-virtual int64_t PoleIdxForOcoor(Coordinate &Coor) { return 0;};
-virtual int64_t PoleSiteForOcoor(Coordinate &Coor){ return 0;}
 ////////////////////////////////////////////////////////////////
 // Checkerboarding interface is virtual and overridden by
 // GridCartesian / GridRedBlackCartesian
 ////////////////////////////////////////////////////////////////
 virtual int CheckerBoarded(int dim) =0;
 virtual int CheckerBoard(const Coordinate &site)=0;
 virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
@@ -191,8 +176,6 @@ public:
 }
 return permute_type;
 }
 ////////////////////////////////////////////////////////////////
 // Array sizing queries
 ////////////////////////////////////////////////////////////////

View File

@@ -183,6 +183,7 @@ public:
 int recv_from_rank,
 int bytes);
+int IsOffNode(int rank);
 double StencilSendToRecvFrom(void *xmit,
 int xmit_to_rank,int do_xmit,
 void *recv,
@@ -201,9 +202,9 @@ public:
 void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
 double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
+void *xmit,void *xmit_comp,
 int xmit_to_rank,int do_xmit,
-void *recv,
+void *recv,void *recv_comp,
 int recv_from_rank,int do_recv,
 int xbytes,int rbytes,int dir);

View File

@@ -270,24 +270,24 @@ void CartesianCommunicator::GlobalSum(double &d)
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
-FlightRecorder::StepLog("AllReduce");
+FlightRecorder::StepLog("AllReduce float");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
-FlightRecorder::StepLog("AllReduce");
+FlightRecorder::StepLog("AllReduce double");
 int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
-FlightRecorder::StepLog("AllReduce");
+FlightRecorder::StepLog("AllReduce uint32_t");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
-FlightRecorder::StepLog("AllReduce");
+FlightRecorder::StepLog("AllReduce uint64_t");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
 assert(ierr==0);
 }
@@ -301,26 +301,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
+FlightRecorder::StepLog("GlobalXOR");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(float &f)
 {
+FlightRecorder::StepLog("GlobalMax");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(double &d)
 {
+FlightRecorder::StepLog("GlobalMax");
 int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
+FlightRecorder::StepLog("GlobalSumVector(float *)");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
+FlightRecorder::StepLog("GlobalSumVector(double *)");
 int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 assert(ierr==0);
 }
@@ -395,11 +400,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
 std::vector<CommsRequest_t> list;
 double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
-offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
 StencilSendToRecvFromComplete(list,dir);
 return offbytes;
 }
+int CartesianCommunicator::IsOffNode(int rank)
+{
+int grank = ShmRanks[rank];
+if ( grank == MPI_UNDEFINED ) return true;
+else return false;
+}
 #ifdef ACCELERATOR_AWARE_MPI
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -414,9 +424,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 return 0.0; // Do nothing -- no preparation required
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
+void *xmit,void *xmit_comp,
 int dest,int dox,
-void *recv,
+void *recv,void *recv_comp,
 int from,int dor,
 int xbytes,int rbytes,int dir)
 {
@@ -440,7 +450,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 if ( dor ) {
 if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
 tag= dir+from*32;
-ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
+ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
 assert(ierr==0);
 list.push_back(rrq);
 off_node_bytes+=rbytes;
@@ -449,6 +460,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 else {
 void *shm = (void *) this->ShmBufferTranslate(from,xmit);
 assert(shm!=NULL);
+// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
 acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
 }
 #endif
@@ -457,7 +469,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 if (dox) {
 if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
 tag= dir+_processor*32;
-ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
 assert(ierr==0);
 list.push_back(xrq);
 off_node_bytes+=xbytes;
@@ -676,9 +688,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
+void *xmit,void *xmit_comp,
 int dest,int dox,
-void *recv,
+void *recv,void *recv_comp,
 int from,int dor,
 int xbytes,int rbytes,int dir)
 {
@@ -829,6 +841,7 @@ int CartesianCommunicator::RankWorld(void){
 return r;
 }
 void CartesianCommunicator::BarrierWorld(void){
+FlightRecorder::StepLog("BarrierWorld");
 int ierr = MPI_Barrier(communicator_world);
 assert(ierr==0);
 }
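The new IsOffNode() query and the xmit_comp/recv_comp arguments above let the stencil route a halo either through MPI or through the intra-node shared-memory window. An illustrative sketch only (not Grid source): the helper name haloExchange and its argument list are hypothetical, and the real on-node/off-node decision is made inside the stencil code itself.

#include <Grid/Grid.h>
using namespace Grid;

void haloExchange(CartesianCommunicator &comm,
                  void *xmit, int dest, void *recv, int from, int bytes, int dir)
{
  std::vector<CommsRequest_t> list;
  if (comm.IsOffNode(from)) {
    // Off-node neighbour: go through MPI; here the "compressed" buffers are just
    // the plain buffers, exactly as StencilSendToRecvFrom() passes them in the diff.
    comm.StencilSendToRecvFromBegin(list, xmit, xmit, dest, 1,
                                    recv, recv, from, 1, bytes, bytes, dir);
    comm.StencilSendToRecvFromComplete(list, dir);
  } else {
    // On-node neighbour: translate the peer's buffer into our shared window and copy.
    void *remote = comm.ShmBufferTranslate(from, xmit);
    acceleratorCopyDeviceToDeviceAsynch(remote, recv, bytes);
  }
}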

View File

@@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 dest=0;
 }
+int CartesianCommunicator::IsOffNode(int rank) { return false; }
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 int xmit_to_rank,int dox,
 void *recv,

View File

@@ -543,49 +543,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
 // printf("Host buffer allocate for GPU non-aware MPI\n");
-#if 0
-HostCommBuf= acceleratorAllocHost(bytes);
-#else
 HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#if 0
-#warning "Moving host buffers to specific NUMA domain"
-int numa;
-char *numa_name=(char *)getenv("MPI_BUF_NUMA");
-if(numa_name) {
-unsigned long page_size = sysconf(_SC_PAGESIZE);
-numa = atoi(numa_name);
-unsigned long page_count = bytes/page_size;
-std::vector<void *> pages(page_count);
-std::vector<int> nodes(page_count,numa);
-std::vector<int> status(page_count,-1);
-for(unsigned long p=0;p<page_count;p++){
-pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
-}
-int ret = move_pages(0,
-page_count,
-&pages[0],
-&nodes[0],
-&status[0],
-MPOL_MF_MOVE);
-printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
-if (ret) perror(" move_pages failed for reason:");
-}
-#endif
-acceleratorPin(HostCommBuf,bytes);
-#endif
 #endif
 ShmCommBuf = acceleratorAllocDevice(bytes);
 if (ShmCommBuf == (void *)NULL ) {
-std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
 exit(EXIT_FAILURE);
 }
 if ( WorldRank == 0 ){
-std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
+std::cout << Mheader " acceleratorAllocDevice "<< bytes
 << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
 }
 SharedMemoryZero(ShmCommBuf,bytes);
-std::cout<< "Setting up IPC"<<std::endl;
+if ( WorldRank == 0 ){
+std::cout<< Mheader "Setting up IPC"<<std::endl;
+}
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Loop over ranks/gpu's on our node
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -616,8 +588,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 if ( err != ZE_RESULT_SUCCESS ) {
 std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 exit(EXIT_FAILURE);
-} else {
-std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 }
 memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
 handle.pid = getpid();
@@ -676,12 +646,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef SHM_SOCKETS
 myfd=UnixSockets::RecvFileDescriptor();
 #else
-std::cout<<"mapping seeking remote pid/fd "
-<<handle.pid<<"/"
-<<handle.fd<<std::endl;
+// std::cout<<"mapping seeking remote pid/fd "
+// <<handle.pid<<"/"
+// <<handle.fd<<std::endl;
 int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
-std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
+// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
 // int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
 myfd = syscall(438,pidfd,handle.fd,0);
 int err_t = errno;
@@ -691,7 +661,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 assert(0);
 }
 #endif
-std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
+// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
 memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
 memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -700,9 +670,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
 std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 exit(EXIT_FAILURE);
-} else {
-std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
-std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
 }
 assert(thisBuf!=nullptr);
 }
@@ -783,6 +750,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 WorldShmCommBufs[r] =ptr;
 // std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
 }
+std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
 _ShmAlloc=1;
 _ShmAllocBytes = bytes;
 };
@@ -1039,11 +1007,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
 int gpeer = ShmRanks[rank];
 assert(gpeer!=ShmRank); // never send to self
+// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
 if (gpeer == MPI_UNDEFINED){
 return NULL;
 } else {
 uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
 uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
 return (void *) remote;
 }
 }

View File

@@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);
 const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
-assert(!rhs.Grid()->isIcosahedral());
 typedef typename vobj::vector_type vector_type;
 typedef typename vobj::scalar_type scalar_type;
@@ -145,9 +143,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 int comm_proc = ((x+sshift)/rd)%pd;
 if (comm_proc==0) {
+FlightRecorder::StepLog("Cshift_Copy_plane");
 tcopy-=usecond();
 Copy_plane(ret,rhs,dimension,x,sx,cbmask);
 tcopy+=usecond();
+FlightRecorder::StepLog("Cshift_Copy_plane_complete");
 } else {
 int words = buffer_size;
@@ -155,9 +155,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 int bytes = words * sizeof(vobj);
+FlightRecorder::StepLog("Cshift_Gather_plane");
 tgather-=usecond();
 Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
 tgather+=usecond();
+FlightRecorder::StepLog("Cshift_Gather_plane_complete");
 // int rank = grid->_processor;
 int recv_from_rank;
@@ -168,6 +170,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 tcomms-=usecond();
 grid->Barrier();
+FlightRecorder::StepLog("Cshift_SendRecv");
 #ifdef ACCELERATOR_AWARE_MPI
 grid->SendToRecvFrom((void *)&send_buf[0],
 xmit_to_rank,
@@ -184,10 +187,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 bytes);
 acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
 #endif
+FlightRecorder::StepLog("Cshift_SendRecv_complete");
 xbytes+=bytes;
 grid->Barrier();
 tcomms+=usecond();
+FlightRecorder::StepLog("Cshift_barrier_complete");
 tscatter-=usecond();
 Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);

View File

@@ -30,7 +30,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
-assert(!rhs.Grid()->isIcosahedral());
 Lattice<vobj> ret(rhs.Grid());
 ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
 Cshift_local(ret,rhs,dimension,shift);

View File

@@ -373,17 +373,14 @@ public:
 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
 typedef typename vobj::scalar_object sobj;
-uint64_t gsites=1;
-uint64_t polesites=0;
-for(int d=0;d<o.Grid()->_ndimension;d++) gsites *= o.Grid()->_gdimensions[d];
-for(int64_t g=0;g<gsites;g++){
+for(int64_t g=0;g<o.Grid()->_gsites;g++){
 Coordinate gcoor;
 o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
 sobj ss;
 peekSite(ss,o,gcoor);
-stream<<"["<< g<<" : ";
+stream<<"[";
 for(int d=0;d<gcoor.size();d++){
 stream<<gcoor[d];
 if(d!=gcoor.size()-1) stream<<",";
@@ -391,41 +388,6 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Latti
 stream<<"]\t";
 stream<<ss<<std::endl;
 }
if ( o.Grid()->isIcosahedralVertex() ) {
uint64_t psites=1;
Coordinate perpdims;
for(int d=2;d<o.Grid()->_ndimension-1;d++){
int pd=o.Grid()->_gdimensions[d];
psites*=pd;
perpdims.push_back(pd);
}
for(uint64_t p=0;p<psites;p++){
sobj ss;
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
peekPole(ss,o,orthog,South);
stream<<"[ SouthPole : ";
for(int d=0;d<orthog.size();d++){
stream<<orthog[d];
if(d!=orthog.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
for(uint64_t p=0;p<psites;p++){
sobj ss;
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
peekPole(ss,o,orthog,North);
stream<<"[ NorthPole : ";
for(int d=0;d<orthog.size();d++){
stream<<orthog[d];
if(d!=orthog.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
}
 return stream;
 }

View File

@@ -34,86 +34,22 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
 typedef typename iobj::scalar_type scalar_type;
 typedef typename iobj::vector_type vector_type;
-l=Zero();
 GridBase *grid = l.Grid();
 int Nsimd = grid->iSites();
-int cartesian_vol = grid->oSites();
-if ( grid->isIcosahedral() ) {
-cartesian_vol = cartesian_vol - grid->NorthPoleOsites()-grid->SouthPoleOsites();
-}
-{
-autoView(l_v, l, CpuWrite);
-thread_for( o, cartesian_vol, {
-vector_type vI;
-Coordinate gcoor;
-ExtractBuffer<scalar_type> mergebuf(Nsimd);
-for(int i=0;i<grid->iSites();i++){
-grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
-mergebuf[i]=(Integer)gcoor[mu];
-}
-merge<vector_type,scalar_type>(vI,mergebuf);
-l_v[o]=vI;
-});
-}
-if (grid->isIcosahedralVertex()) {
-uint64_t psites=1;
-Coordinate perpdims;
-typename iobj::scalar_object ss;
-for(int d=2;d<grid->_ndimension-1;d++){
-int pd=grid->_gdimensions[d];
-psites*=pd;
-perpdims.push_back(pd);
-}
-for(uint64_t p=0;p<psites;p++){
-Coordinate orthog;
-Lexicographic::CoorFromIndex(orthog,p,perpdims);
-int icoor;
-if ( mu>=2 && mu < grid->_ndimension-1) {
-icoor = orthog[mu-2];
-} else {
-icoor = -1;
-}
-ss=scalar_type(icoor);
-pokePole(ss,l,orthog,South);
-pokePole(ss,l,orthog,North);
-}
-}
-};
-template<class iobj> inline void LatticePole(Lattice<iobj> &l,NorthSouth pole)
-{
-typedef typename iobj::scalar_object sobj;
-typedef typename iobj::scalar_type scalar_type;
-typedef typename iobj::vector_type vector_type;
-GridBase *grid = l.Grid();
-l=Zero();
-assert(grid->isIcosahedralVertex());
-if (grid->isIcosahedralVertex()) {
-uint64_t psites=1;
-Coordinate perpdims;
-sobj ss;
-scalar_type one(1.0);
-ss=one;
-for(int d=2;d<l.Grid()->_ndimension-1;d++){
-int pd=l.Grid()->_gdimensions[d];
-psites*=pd;
-perpdims.push_back(pd);
-}
-for(uint64_t p=0;p<psites;p++){
-Coordinate orthog;
-Lexicographic::CoorFromIndex(orthog,p,perpdims);
-pokePole(ss,l,orthog,pole);
-}
-}
+autoView(l_v, l, CpuWrite);
+thread_for( o, grid->oSites(), {
+vector_type vI;
+Coordinate gcoor;
+ExtractBuffer<scalar_type> mergebuf(Nsimd);
+for(int i=0;i<grid->iSites();i++){
+grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
+mergebuf[i]=(Integer)gcoor[mu];
+}
+merge<vector_type,scalar_type>(vI,mergebuf);
+l_v[o]=vI;
+});
 };
 NAMESPACE_END(Grid);

View File

@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 grid->GlobalCoorToRankIndex(rank,odx,idx,site);
 ExtractBuffer<sobj> buf(Nsimd);
-autoView( l_v , l, CpuRead);
+autoView( l_v , l, CpuWrite);
 extract(l_v[odx],buf);
 s = buf[idx];
@@ -151,261 +151,6 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 return;
 };
// zero for south pole, one for north pole
template<class vobj,class sobj>
void peekPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
s=Zero();
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
Coordinate pgrid = grid->ProcessorGrid();
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
Coordinate pcoor(grid->_ndimension);
for(int d=2;d<Ndm1;d++){
int dd=d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
if(isNorth == North){
pcoor[xdim] = 0;
pcoor[ydim] = pgrid[ydim]-1;
pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
} else {
pcoor[xdim] = pgrid[xdim]-1;
pcoor[ydim] = 0;
pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
}
rank = grid->RankFromProcessorCoor(pcoor);
if ( rank == grid->ThisRank() ) {
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
s = buf[pole_isite];
}
grid->Broadcast(rank,s);
return;
};
template<class vobj,class sobj>
void pokePole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
grid->Broadcast(grid->BossRank(),s);
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
Coordinate pgrid = grid->ProcessorGrid();
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
Coordinate pcoor(grid->_ndimension,0);
for(int d=2;d<Ndm1;d++){
int dd = d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
int o = orthog[dd];
int r = grid->_rdimensions[d];
int omr = o % r;
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
if(isNorth ==North){
pcoor[xdim] = 0;
pcoor[ydim] = pgrid[ydim]-1;
pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
} else {
pcoor[xdim] = pgrid[xdim]-1;
pcoor[ydim] = 0;
pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
}
rank = grid->RankFromProcessorCoor(pcoor);
// extract-modify-merge cycle is easiest way and this is not perf critical
if ( rank == grid->ThisRank() ) {
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
buf[pole_isite] = s;
merge(l_v[osite],buf);
}
return;
};
template<class vobj,class sobj>
void peekLocalPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
s=Zero();
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
Coordinate pgrid = grid->ProcessorGrid();
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
// Coordinate pcoor(grid->_ndimension);
for(int d=2;d<Ndm1;d++){
int dd=d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
// pcoor[d] = orthog[dd]/grid->_ldimensions[d];
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
if(isNorth == North){
// pcoor[xdim] = 0;
// pcoor[ydim] = pgrid[ydim]-1;
// pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
assert(grid->ownsNorthPole());
} else {
// pcoor[xdim] = pgrid[xdim]-1;
// pcoor[ydim] = 0;
// pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
assert(grid->ownsSouthPole());
}
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
s = buf[pole_isite];
return;
};
template<class vobj,class sobj>
void pokeLocalPole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
// Coordinate pcoor(grid->_ndimension,0);
for(int d=2;d<Ndm1;d++){
int dd = d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
// pcoor[d] = orthog[dd]/grid->_ldimensions[d];
int o = orthog[dd];
int r = grid->_rdimensions[d];
int omr = o % r;
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
int insert=0;
if(isNorth ==North){
// pcoor[xdim] = 0;
// pcoor[ydim] = pgrid[ydim]-1;
// pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
assert(grid->ownsNorthPole());
} else {
// pcoor[xdim] = pgrid[xdim]-1;
// pcoor[ydim] = 0;
// pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
assert(grid->ownsSouthPole());
}
// extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
buf[pole_isite] = s;
merge(l_v[osite],buf);
return;
};
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
@@ -434,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
 for(int w=0;w<words;w++){
 pt[w] = getlane(vp[w],idx);
 }
-// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
 return;
 };
template<class vobj,class sobj> template<class vobj,class sobj>

View File

@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 assert(ok);
 }
 FlightRecorder::StepLog("Start global sum");
-// grid->GlobalSumP2P(nrm);
-grid->GlobalSum(nrm);
+grid->GlobalSumP2P(nrm);
+// grid->GlobalSum(nrm);
 FlightRecorder::StepLog("Finished global sum");
 // std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
 FlightRecorder::ReductionLog(local,real(nrm));

View File

@@ -48,45 +48,31 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////
 inline int RNGfillable(GridBase *coarse,GridBase *fine)
 {
-if ( coarse == fine ) return 1;
-if ( coarse->isIcosahedral()) assert(coarse->isIcosahedralEdge());
-if ( fine->isIcosahedralVertex() && coarse->isIcosahedralEdge() ) {
-assert(fine->Nd()==coarse->Nd());
-for(int d=0;d<fine->Nd();d++){
-assert(fine->LocalDimensions()[d] == coarse->LocalDimensions()[d]);
-}
-return 1;
-}
-{
-int rngdims = coarse->_ndimension;
-// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
-int lowerdims = fine->_ndimension - coarse->_ndimension;
-assert(lowerdims >= 0);
-for(int d=0;d<lowerdims;d++){
-assert(fine->_simd_layout[d]==1);
-assert(fine->_processors[d]==1);
-}
-int multiplicity=1;
-for(int d=0;d<lowerdims;d++){
-multiplicity=multiplicity*fine->_rdimensions[d];
-}
-// local and global volumes subdivide cleanly after SIMDization
-for(int d=0;d<rngdims;d++){
-int fd= d+lowerdims;
-assert(coarse->_processors[d] == fine->_processors[fd]);
-assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
-assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
-multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
-}
-return multiplicity;
-}
+int rngdims = coarse->_ndimension;
+// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
+int lowerdims = fine->_ndimension - coarse->_ndimension;
+assert(lowerdims >= 0);
+for(int d=0;d<lowerdims;d++){
+assert(fine->_simd_layout[d]==1);
+assert(fine->_processors[d]==1);
+}
+int multiplicity=1;
+for(int d=0;d<lowerdims;d++){
+multiplicity=multiplicity*fine->_rdimensions[d];
+}
+// local and global volumes subdivide cleanly after SIMDization
+for(int d=0;d<rngdims;d++){
+int fd= d+lowerdims;
+assert(coarse->_processors[d] == fine->_processors[fd]);
+assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
+multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
+}
+return multiplicity;
 }
@@ -94,19 +80,6 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
 // this function is necessary for the LS vectorised field
 inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
 {
-if ( coarse == fine ) return 1;
-if ( coarse->isIcosahedral()) assert(coarse->isIcosahedralEdge());
-if ( fine->isIcosahedralVertex() && coarse->isIcosahedralEdge() ) {
-assert(fine->Nd()==coarse->Nd());
-for(int d=0;d<fine->Nd();d++){
-assert(fine->LocalDimensions()[d] == coarse->LocalDimensions()[d]);
-}
-return 1;
-}
 int rngdims = coarse->_ndimension;
 // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
@@ -379,12 +352,12 @@ private:
 public:
 GridBase *Grid(void) const { return _grid; }
 int generator_idx(int os,int is) {
-return (is*_grid->CartesianOsites()+os)%_grid->lSites(); // On the pole sites wrap back to normal generators; Icosahedral hack
+return is*_grid->oSites()+os;
 }
 GridParallelRNG(GridBase *grid) : GridRNGbase() {
 _grid = grid;
-_vol =_grid->lSites();
+_vol =_grid->iSites()*_grid->oSites();
 _generators.resize(_vol);
 _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
@@ -408,7 +381,7 @@ public:
 int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
 int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too
-int osites = _grid->CartesianOsites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity, except on Icosahedral
+int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
 int words = sizeof(scalar_object) / sizeof(scalar_type);
 autoView(l_v, l, CpuWrite);
@@ -429,27 +402,8 @@ public:
 // merge into SIMD lanes, FIXME suboptimal implementation
 merge(l_v[sm], buf);
 }
 });
-// });
-/*
-* Fill in the poles for an Icosahedral vertex mesh
-*/
-if (l.Grid()->isIcosahedralVertex()) {
-int64_t pole_sites=l.Grid()->NorthPoleOsites()+l.Grid()->SouthPoleOsites();
-int64_t pole_base =l.Grid()->CartesianOsites();
-ExtractBuffer<scalar_object> buf(Nsimd);
-for (int m = 0; m < pole_sites; m++) { // Draw from same generator multiplicity times
-for (int si = 0; si < Nsimd; si++) {
-int gdx = 0;
-scalar_type *pointer = (scalar_type *)&buf[si];
-dist[gdx].reset();
-for (int idx = 0; idx < words; idx++)
-fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
-}
-merge(l_v[pole_base+m], buf);
-}
-}
 _time_counter += usecond()- inner_time_counter;
 }
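For reference, the indexing restored in the GridParallelRNG hunk above assigns one generator per (outer site, SIMD lane) pair, laid out lane-major. A tiny worked illustration with made-up sizes:

#include <cassert>
int demo_generator_idx(int oSites, int iSites, int os, int is)
{
  int vol  = iSites * oSites;   // _vol set in the GridParallelRNG constructor
  int gidx = is * oSites + os;  // generator_idx(os,is) as in the diff
  assert(gidx < vol);           // e.g. oSites=512, iSites=8, os=3, is=5 -> gidx=2563
  return gidx;
}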

View File

@@ -49,7 +49,7 @@ static constexpr int Tm = 7;
 static constexpr int Nc=Config_Nc;
 static constexpr int Ns=4;
-static constexpr int Nd=Config_Nd;
+static constexpr int Nd=4;
 static constexpr int Nhs=2; // half spinor
 static constexpr int Nds=8; // double stored gauge field
 static constexpr int Ngp=2; // gparity index range
@@ -75,7 +75,6 @@ static constexpr int InverseYes=1;
 //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
 const int SpinorIndex = 2;
-const int PauliIndex = 2; //TensorLevel counts from the bottom!
 template<typename T> struct isSpinor {
 static constexpr bool value = (SpinorIndex==T::TensorLevel);
 };

View File

@@ -123,10 +123,10 @@ public:
 GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
 peekLocalSite(ScalarUmu, Umu_v, lcoor);
-for (int mu = 0; mu < Nd; mu++) ScalarUds(mu) = ScalarUmu(mu);
+for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
 peekLocalSite(ScalarUmu, Uadj_v, lcoor);
-for (int mu = 0; mu < Nd; mu++) ScalarUds(mu + Nd) = ScalarUmu(mu);
+for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
 pokeLocalSite(ScalarUds, Uds_v, lcoor);
 });

View File

@@ -85,15 +85,6 @@ NAMESPACE_CHECK(DomainWall);
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
 NAMESPACE_CHECK(Overlap);
-///////////////////////////////////////////////////////////////////////////////
-// Two spin wilson fermion based
-///////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
-NAMESPACE_CHECK(TwoSpinWilson);
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////

View File

@@ -41,9 +41,8 @@ NAMESPACE_CHECK(Compressor);
 NAMESPACE_CHECK(FermionOperatorImpl);
 #include <Grid/qcd/action/fermion/FermionOperator.h>
 NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
 #include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions
-#include <Grid/qcd/action/fermion/TwoSpinWilsonKernels.h> //used for 3D fermions, pauli in place of Dirac
 NAMESPACE_CHECK(Kernels);
 #endif

View File

@@ -180,12 +180,6 @@ NAMESPACE_CHECK(ImplGparityWilson);
 #include <Grid/qcd/action/fermion/StaggeredImpl.h>
 NAMESPACE_CHECK(ImplStaggered);
-/////////////////////////////////////////////////////////////////////////////
-// Two component spinor Wilson action for 3d / Boston
-/////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/TwoSpinWilsonImpl.h>
-NAMESPACE_CHECK(ImplTwoSpinWilson);
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////


@@ -274,7 +274,7 @@ public:
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
- Uds_v[ss](0)(mu+Nd) = Utmp_v[ss]();
+ Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
});
}
Utmp = Uconj;
@@ -286,7 +286,7 @@ public:
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
- Uds_v[ss](1)(mu+Nd) = Utmp_v[ss]();
+ Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
});
}
}
@@ -320,7 +320,7 @@ public:
}
Uconj = conjugate(*Upoke);
- pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + Nd);
+ pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
}
}


@@ -36,8 +36,6 @@ public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 16;
- static std::vector<int> MakeDirections(void);
- static std::vector<int> MakeDisplacements(void);
};
template <class Impl>
@@ -156,6 +154,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
+ void SloppyComms(int sloppy)
+ {
+   Stencil.SetSloppyComms(sloppy);
+   StencilEven.SetSloppyComms(sloppy);
+   StencilOdd.SetSloppyComms(sloppy);
+ }
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
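
The SloppyComms member added in this hunk (and repeated in the other fermion headers below) only forwards a flag to the operator's three stencils. The following standalone sketch mirrors that forwarding and shows one plausible way a caller could use it, switching compressed halo exchange on for inner, tolerance-limited solves and off for the final solve; the FermionOp and Stencil mock types, the member names and the flag values are illustrative assumptions, not Grid code.

// Standalone sketch (not Grid code): mirrors the SloppyComms -> SetSloppyComms
// forwarding added in this diff, with a hypothetical caller toggling
// reduced-precision (sloppy) halo exchange around inner solves only.
#include <iostream>

struct Stencil {                      // stand-in for CartesianStencil
  int sloppy = 0;
  void SetSloppyComms(int s) { sloppy = s; }
};

struct FermionOp {                    // stand-in for a Grid fermion operator
  Stencil Stencil_, StencilEven, StencilOdd;
  void SloppyComms(int sloppy) {      // same forwarding as the added member
    Stencil_.SetSloppyComms(sloppy);
    StencilEven.SetSloppyComms(sloppy);
    StencilOdd.SetSloppyComms(sloppy);
  }
};

int main() {
  FermionOp Dw;
  Dw.SloppyComms(1);                  // inner solves: compressed comms
  std::cout << "inner sloppy=" << Dw.Stencil_.sloppy << "\n";
  Dw.SloppyComms(0);                  // final correction solve: full precision
  std::cout << "final sloppy=" << Dw.Stencil_.sloppy << "\n";
}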


@@ -40,8 +40,6 @@ public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
const int npoint = 16;
- static std::vector<int> MakeDirections(void);
- static std::vector<int> MakeDisplacements(void);
};
template<class Impl>
@@ -181,6 +179,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
+ void SloppyComms(int sloppy)
+ {
+   Stencil.SetSloppyComms(sloppy);
+   StencilEven.SetSloppyComms(sloppy);
+   StencilOdd.SetSloppyComms(sloppy);
+ }
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;


@@ -36,8 +36,6 @@ public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
- static std::vector<int> MakeDirections(void);
- static std::vector<int> MakeDisplacements(void);
};
template <class Impl>
@@ -148,6 +146,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
+ void SloppyComms(int sloppy)
+ {
+   Stencil.SetSloppyComms(sloppy);
+   StencilEven.SetSloppyComms(sloppy);
+   StencilOdd.SetSloppyComms(sloppy);
+ }
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;


@@ -141,9 +141,9 @@ public:
Udag = Udag *phases;
InsertGaugeField(Uds,U,mu);
- InsertGaugeField(Uds,Udag,mu+Nd);
+ InsertGaugeField(Uds,Udag,mu+4);
// PokeIndex<LorentzIndex>(Uds, U, mu);
- // PokeIndex<LorentzIndex>(Uds, Udag, mu + Nd);
+ // PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
// 3 hop based on thin links. Crazy huh ?
U = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -156,7 +156,7 @@ public:
UUUdag = UUUdag *phases;
InsertGaugeField(UUUds,UUU,mu);
- InsertGaugeField(UUUds,UUUdag,mu+Nd);
+ InsertGaugeField(UUUds,UUUdag,mu+4);
}
}


@@ -1,175 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
class TwoSpinWilsonFermion3plus1DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static const std::vector<int> directions;
static const std::vector<int> displacements;
static constexpr int npoint = 6;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
};
template<class Impl>
class TwoSpinWilsonFermion3plus1D : public TwoSpinWilsonKernels<Impl>, public TwoSpinWilsonFermion3plus1DStatic
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef TwoSpinWilsonKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
int Dirichlet;
Coordinate Block;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _ThreeDimGrid ;}
GridBase *GaugeRedBlackGrid(void) { return _ThreeDimRedBlackGrid ;}
GridBase *FermionGrid(void) { return _FourDimGrid;}
GridBase *FermionRedBlackGrid(void) { return _FourDimRedBlackGrid;}
// full checkerboard operations; leave unimplemented as abstract for now
virtual void M (const FermionField &in, FermionField &out){assert(0);};
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
// These can be overridden by fancy 5d chiral action
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
// void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
// Implement hopping term non-hermitian hopping term; half cb or both
// Implement s-diagonal DW
void DW (const FermionField &in, FermionField &out,int dag);
void Dhop (const FermionField &in, FermionField &out,int dag);
void DhopOE(const FermionField &in, FermionField &out,int dag);
void DhopEO(const FermionField &in, FermionField &out,int dag);
void DhopComms (const FermionField &in, FermionField &out);
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
// add a DhopComm
// -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
void DhopDirComms(const FermionField &in);
void DhopDirCalc(const FermionField &in, FermionField &out,int point);
///////////////////////////////////////////////////////////////
// New methods added
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
GridCartesian &ThreeDimGrid,
GridRedBlackCartesian &ThreeDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams());
virtual void DirichletBlock(const Coordinate & block)
{
}
// DoubleStore
void ImportGauge(const GaugeField &_Umu);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:
// Add these to the support from Wilson
GridBase *_ThreeDimGrid;
GridBase *_ThreeDimRedBlackGrid;
GridBase *_FourDimGrid;
GridBase *_FourDimRedBlackGrid;
double M5;
int Ls;
//Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
};
NAMESPACE_END(Grid);
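
The removed 3+1D two-spin operator declared above couples a four-dimensional fermion grid, with the s direction innermost and never checkerboarded or MPI-decomposed, to a three-dimensional gauge grid; the constructor in the removed .cc further down asserts exactly this layout. A toy, self-contained bookkeeping check of those constraints follows; the extents and rank layout are illustrative values, not taken from the source.

// Layout constraints asserted by the removed TwoSpinWilsonFermion3plus1D
// constructor: dimension 0 of the fermion grid is Ls (kept local), and
// dimensions 1..3 must match the 3D gauge grid in extent and decomposition.
#include <cassert>
#include <vector>

int main() {
  const int Ls = 8;
  std::vector<int> gauge3   = {16, 16, 16};      // illustrative 3D extents
  std::vector<int> fermion4 = {Ls, 16, 16, 16};  // s innermost
  std::vector<int> procs4   = {1, 2, 2, 2};      // no MPI ranks along s

  assert(fermion4.size() == gauge3.size() + 1);
  assert(procs4[0] == 1);                        // "s direction not spread out"
  for (size_t d = 0; d < gauge3.size(); d++)
    assert(fermion4[d + 1] == gauge3[d]);
  return 0;
}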


@@ -1,222 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index
/////////////////////////////////////////////////////////////////////////////
template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
class TwoSpinWilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public:
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
//Necessary?
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
typedef typename Options::_Coeff_t Coeff_t;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplHalfCommSpinor<Simd> SiteHalfCommSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField;
typedef Lattice<SitePropagator> PropagatorField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef SimpleCompressor<SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
typedef const typename StencilImpl::View_type StencilView;
ImplParams Params;
TwoSpinWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
};
template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U,
const _Spinor &chi,
int mu)
{
auto UU = coalescedRead(U(mu));
mult(&phi(), &UU, &chi());
}
template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U,
const _Spinor &chi,
int mu,
StencilEntry *SE,
StencilView &St)
{
multLink(phi,U,chi,mu);
}
template<class _SpinorField>
inline void multLinkField(_SpinorField & out,
const DoubledGaugeField &Umu,
const _SpinorField & phi,
int mu)
{
const int Nsimd = SiteHalfSpinor::Nsimd();
autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead);
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
coalescedWrite(out_v[sss],tmp);
});
}
template <class ref>
static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
{
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &Uds,
const GaugeField &Umu)
{
typedef typename Simd::scalar_type scalar_type;
conformable(Uds.Grid(), GaugeGrid);
conformable(Umu.Grid(), GaugeGrid);
GaugeLinkField U(GaugeGrid);
GaugeLinkField tmp(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
////////////////////////////////////////////////////
// apply any boundary phase or twists
////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
////////// boundary phase /////////////
auto pha = Params.boundary_phases[mu];
scalar_type phase( real(pha),imag(pha) );
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu);
// apply any twists
RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
if ( theta != 0.0) {
scalar_type twphase(::cos(theta),::sin(theta));
U = twphase*U;
std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
}
tmp = where(coor == Lmu, phase * U, U);
PokeIndex<LorentzIndex>(Uds, tmp, mu);
U = adj(Cshift(U, mu, -1));
U = where(coor == 0, conjugate(phase) * U, U);
PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
GaugeLinkField link(mat.Grid());
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu);
}
inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
mat = outerProduct(B,A);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
mat = TraceIndex<SpinIndex>(P);
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
{
for (int mu = 0; mu < Nd; mu++)
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
{
int Ls=Btilde.Grid()->_fdimensions[0];
autoView( mat_v , mat, AcceleratorWrite);
{
const int Nsimd = SiteSpinor::Nsimd();
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF]()(spn) );
auto op = outerProduct(bb,aa);
sum = sum + op;
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
}
};
typedef TwoSpinWilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplR; // Real.. whichever prec
typedef TwoSpinWilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplF; // Float
typedef TwoSpinWilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD; // Double
typedef TwoSpinWilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD2; // Double
NAMESPACE_END(Grid);
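
The DoubleStore in the removed implementation above multiplies the forward link on the last slice of each direction by the boundary phase, stores the adjoint of the shifted link with the conjugate phase in the backward half of the doubled field, and optionally twists every link by exp(i*2*pi*n_mu/L). A small self-contained check of that phase bookkeeping, using a U(1) toy link in place of an SU(3) matrix and illustrative values for L and the twist:

// Phase bookkeeping of the removed DoubleStore: forward boundary link gets
// `phase`, the matching backward entry gets `conj(phase)` times the adjoint,
// and a twist multiplies all links by exp(i*theta), theta = 2*pi*n/L.
#include <cmath>
#include <complex>
#include <iostream>

int main() {
  const int    L     = 8;                        // illustrative extent
  const double n2piL = 1.0;                      // twist_n_2pi_L[mu] (assumed)
  const std::complex<double> phase(-1.0, 0.0);   // e.g. antiperiodic boundary
  const double theta = n2piL * 2.0 * M_PI / L;
  const std::complex<double> twphase(std::cos(theta), std::sin(theta));

  std::complex<double> U(0.6, 0.8);              // toy unit-modulus "link"
  U *= twphase;                                  // twist applied to all links

  std::complex<double> Ufwd = phase * U;                        // slice x_mu == L-1
  std::complex<double> Ubwd = std::conj(phase) * std::conj(U);  // adjoint link at x_mu == 0

  // The backward entry is exactly the adjoint of the phased forward link:
  std::cout << std::abs(Ubwd - std::conj(Ufwd)) << "\n";        // prints 0
}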


@@ -1,84 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Impl> class TwoSpinWilsonKernels : public FermionOperator<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public:
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids);
static void DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
private:
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
public:
TwoSpinWilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
};
NAMESPACE_END(Grid);


@@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in simd directions
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too??? Deprecate
/////////////////////////////////////////////////////////////////////////////////////////////
@@ -242,8 +39,7 @@ public:
//Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
- class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
- // : public FaceGatherSimple
+ class WilsonCompressorTemplate : public FaceGatherSimple
{
public:
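
The FaceGatherPartialDWF machinery removed in this hunk gathered only the depth s-slices at each end of the fifth dimension for off-node faces and zero-filled the middle slices on receive, shrinking the DWF comms buffers by a factor Ls/(2*depth), with depth (dwf_compressor_depth) equal to 2. A small standalone check of that bookkeeping, with an illustrative Ls:

// Buffer bookkeeping of the removed FaceGatherPartialDWF: only s < depth and
// s >= Ls-depth are communicated; the receiver zero-fills the middle slices.
// PartialCompressionFactor = Ls / (2*depth).
#include <iostream>
#include <vector>

int main() {
  const int Ls = 16, depth = 2;                  // depth == dwf_compressor_depth
  std::cout << "compression factor = " << Ls / (2 * depth) << "\n";  // 4

  std::vector<int> sent;                         // s-slices that travel over MPI
  for (int s = 0; s < Ls; s++) {
    if (s < depth || s >= Ls - depth) sent.push_back(s);  // same test as Gather_plane_simple
  }
  for (int s : sent) std::cout << s << " ";      // 0 1 14 15
  std::cout << "\n";
}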


@@ -38,8 +38,6 @@ public:
static int MortonOrder;
static const std::vector<int> directions;
static const std::vector<int> displacements;
- static std::vector<int> MakeDirections(void);
- static std::vector<int> MakeDisplacements(void);
static const int npoint = 8;
};
@@ -167,6 +165,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
+ void SloppyComms(int sloppy)
+ {
+   Stencil.SetSloppyComms(sloppy);
+   StencilEven.SetSloppyComms(sloppy);
+   StencilOdd.SetSloppyComms(sloppy);
+ }
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;


@@ -62,8 +62,6 @@ public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static constexpr int npoint = 8;
- static std::vector<int> MakeDirections(void);
- static std::vector<int> MakeDisplacements(void);
};
template<class Impl>
@@ -206,7 +204,14 @@ public:
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
+ void SloppyComms(int sloppy)
+ {
+   Stencil.SetSloppyComms(sloppy);
+   StencilEven.SetSloppyComms(sloppy);
+   StencilOdd.SetSloppyComms(sloppy);
+ }
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;


@@ -166,7 +166,7 @@ public:
U = adj(Cshift(U, mu, -1));
U = where(coor == 0, conjugate(phase) * U, U);
- PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
+ PokeIndex<LorentzIndex>(Uds, U, mu + 4);
}
}


@@ -56,7 +56,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
Frbgrid,
Ugrid,
Urbgrid,
- Nd*1.0,p)
+ 4.0,p)
{
update(_mass,_mu);
@@ -83,7 +83,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
out.Checkerboard() = in.Checkerboard();
//axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
for (int s=0;s<(int)this->mass.size();s++) {
- ComplexD a = Nd*1.0+this->mass[s];
+ ComplexD a = 4.0+this->mass[s];
ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(out,a,in,b,in,s,s);
}
@@ -92,7 +92,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
virtual void MooeeDag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
for (int s=0;s<(int)this->mass.size();s++) {
- ComplexD a = Nd*1.0+this->mass[s];
+ ComplexD a = 4.0+this->mass[s];
ComplexD b(0.0,-this->mu[s]);
axpbg5y_ssp(out,a,in,b,in,s,s);
}
@@ -101,7 +101,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
for (int s=0;s<(int)this->mass.size();s++) {
RealD m = this->mass[s];
RealD tm = this->mu[s];
- RealD mtil = Nd*1.0+this->mass[s];
+ RealD mtil = 4.0+this->mass[s];
RealD sq = mtil*mtil+tm*tm;
ComplexD a = mtil/sq;
ComplexD b(0.0, -tm /sq);
@@ -112,7 +112,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
for (int s=0;s<(int)this->mass.size();s++) {
RealD m = this->mass[s];
RealD tm = this->mu[s];
- RealD mtil = Nd*1.0+this->mass[s];
+ RealD mtil = 4.0+this->mass[s];
RealD sq = mtil*mtil+tm*tm;
ComplexD a = mtil/sq;
ComplexD b(0.0,tm /sq);
@@ -126,7 +126,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
this->Dhop(in, out, DaggerNo);
FermionField tmp(out.Grid());
for (int s=0;s<(int)this->mass.size();s++) {
- ComplexD a = Nd*1.0+this->mass[s];
+ ComplexD a = 4.0+this->mass[s];
ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(tmp,a,in,b,in,s,s);
}
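
The MooeeInv coefficients in the hunks above follow from inverting the twisted-mass even-even term (4+m) + i*mu*gamma5 (Nd+m on the other side of this diff): on each gamma5 eigenspace the operator is the complex scalar mtil +/- i*mu, so a = mtil/(mtil^2+mu^2) and b = -mu/(mtil^2+mu^2), with the sign of b flipped for the daggered inverse. A quick standalone check with illustrative parameter values:

// Check that the MooeeInv coefficients shown above invert Mooee on the
// gamma5 = +1 and gamma5 = -1 eigenspaces (scalar arithmetic only).
#include <complex>
#include <iostream>

int main() {
  const double m = 0.01, mu = 0.05;              // illustrative twisted-mass parameters
  const double mtil = 4.0 + m;
  const double sq   = mtil * mtil + mu * mu;

  for (int g5 : {+1, -1}) {                      // gamma5 eigenvalue
    std::complex<double> Mooee(mtil, g5 * mu);               // from Mooee: a = 4+m, b = +mu
    std::complex<double> MooeeInv(mtil / sq, -g5 * mu / sq); // a = mtil/sq, b = -mu/sq
    std::cout << Mooee * MooeeInv << "\n";                   // (1,0) on both eigenspaces
  }
}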


@@ -240,7 +240,7 @@ void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::ve
this->ceo.resize(Ls);
for(int i=0; i<Ls; ++i){
- this->bee[i] = Nd*1.0 - this->M5 + 1.0;
+ this->bee[i] = 4.0 - this->M5 + 1.0;
this->cee[i] = 1.0;
}


@@ -1,486 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion2plus1D.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid);
// 5d lattice for DWF.
template<class Impl>
TwoSpinWilsonFermion3plus1D<Impl>::TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
GridCartesian &ThreeDimGrid,
GridRedBlackCartesian &ThreeDimRedBlackGrid,
RealD _M5,const ImplParams &p) :
Kernels(p),
_FourDimGrid (&FourDimGrid),
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
_ThreeDimGrid (&ThreeDimGrid),
_ThreeDimRedBlackGrid(&ThreeDimRedBlackGrid),
Stencil (_FourDimGrid,npoint,Even,directions,displacements,p),
StencilEven(_FourDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
StencilOdd (_FourDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
M5(_M5),
Umu(_ThreeDimGrid),
UmuEven(_ThreeDimRedBlackGrid),
UmuOdd (_ThreeDimRedBlackGrid),
_tmp(&FourDimRedBlackGrid),
Dirichlet(0)
{
// some assertions
assert(FourDimGrid._ndimension==Nd+1);
assert(ThreeDimGrid._ndimension==Nd);
assert(ThreeDimRedBlackGrid._ndimension==Nd);
assert(FourDimRedBlackGrid._ndimension==Nd+1);
assert(FourDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
// extent of fifth dim and not spread out
Ls=FourDimGrid._fdimensions[0];
assert(FourDimRedBlackGrid._fdimensions[0]==Ls);
assert(FourDimGrid._processors[0] ==1);
assert(FourDimRedBlackGrid._processors[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<Nd;d++){
assert(FourDimGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
assert(FourDimRedBlackGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
assert(ThreeDimRedBlackGrid._processors[d] ==ThreeDimGrid._processors[d]);
assert(FourDimGrid._fdimensions[d+1] ==ThreeDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._fdimensions[d+1]==ThreeDimGrid._fdimensions[d]);
assert(ThreeDimRedBlackGrid._fdimensions[d] ==ThreeDimGrid._fdimensions[d]);
assert(FourDimGrid._simd_layout[d+1] ==ThreeDimGrid._simd_layout[d]);
assert(FourDimRedBlackGrid._simd_layout[d+1]==ThreeDimGrid._simd_layout[d]);
assert(ThreeDimRedBlackGrid._simd_layout[d] ==ThreeDimGrid._simd_layout[d]);
}
if ( p.dirichlet.size() == Nd+1) {
Coordinate block = p.dirichlet;
for(int d=0;d<Nd+1;d++) {
if ( block[d] ){
Dirichlet = 1;
std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
Block = block;
}
}
} else {
Coordinate block(Nd+1,0);
Block = block;
}
// Dimension zero of the five-d is the Ls direction
assert(FourDimRedBlackGrid._simd_layout[0]==1);
assert(FourDimGrid._simd_layout[0] ==1);
// Allocate the required comms buffer
ImportGauge(_Umu);
// Build lists of exterior only nodes
int LLs = FourDimGrid._rdimensions[0];
int vol3;
vol3=ThreeDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol3);
vol3=ThreeDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol3);
StencilOdd.BuildSurfaceList(LLs,vol3);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::ImportGauge(const GaugeField &_Umu)
{
GaugeField HUmu(_Umu.Grid());
HUmu = _Umu*(-0.5);
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
pickCheckerboard(Even,UmuEven,Umu);
pickCheckerboard(Odd ,UmuOdd,Umu);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
{
int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
// we drop off the innermost fifth dimension
// assert( (disp==1)||(disp==-1) );
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
int skip = (disp==1) ? 0 : 1;
int dirdisp = dir+skip*Nd;
int gamma = dir+(1-skip)*Nd;
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,compressor);
uint64_t Nsite = Umu.Grid()->oSites();
Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
};
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
{
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,compressor);
uint64_t Nsite = Umu.Grid()->oSites();
Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out);
};
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
assert((dag==DaggerNo) ||(dag==DaggerYes));
conformable(st.Grid(),A.Grid());
conformable(st.Grid(),B.Grid());
Compressor compressor(dag);
FermionField Btilde(B.Grid());
FermionField Atilde(B.Grid());
st.HaloExchange(B,compressor);
Atilde=A;
int LLs = B.Grid()->_rdimensions[0];
for (int mu = 0; mu < Nd; mu++) {
////////////////////////////////////////////////////////////////////////
// Flip gamma if dag
////////////////////////////////////////////////////////////////////////
int gamma = mu;
if (!dag) gamma += Nd;
////////////////////////
// Call the single hop
////////////////////////
int Usites = U.Grid()->oSites();
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
////////////////////////////
// spin trace outer product
////////////////////////////
Impl::InsertForce5D(mat, Btilde, Atilde, mu);
}
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDeriv(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A.Grid(),FermionGrid());
conformable(A.Grid(),B.Grid());
//conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment
mat.Checkerboard() = A.Checkerboard();
// mat.checkerboard = A.checkerboard;
DerivInternal(Stencil,Umu,mat,A,B,dag);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivEO(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A.Grid(),FermionRedBlackGrid());
conformable(A.Grid(),B.Grid());
assert(B.Checkerboard()==Odd);
assert(A.Checkerboard()==Even);
mat.Checkerboard() = Even;
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A.Grid(),FermionRedBlackGrid());
conformable(A.Grid(),B.Grid());
assert(B.Checkerboard()==Even);
assert(A.Checkerboard()==Odd);
mat.Checkerboard() = Odd;
DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternal(StencilImpl & st,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopInternalSerialComms(st,U,in,out,dag);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
GRID_TRACE("DhopInternalOverlappedComms");
Compressor compressor(dag);
int LLs = in.Grid()->_rdimensions[0];
int len = U.Grid()->oSites();
/////////////////////////////
// Start comms // Gather intranode and extra node differentiated??
/////////////////////////////
{
// std::cout << " TwoSpinWilsonFermion3plus1D gather " <<std::endl;
GRID_TRACE("Gather");
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
}
// std::cout << " TwoSpinWilsonFermion3plus1D Communicate Begin " <<std::endl;
std::vector<std::vector<CommsRequest_t> > requests;
#if 1
/////////////////////////////
// Overlap with comms
/////////////////////////////
st.CommunicateBegin(requests);
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif
/////////////////////////////
// do the compute interior
/////////////////////////////
if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior");
Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
} else {
GRID_TRACE("DhopInterior");
Kernels::DhopKernel (st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
}
//ifdef GRID_ACCELERATED
#if 0
/////////////////////////////
// Overlap with comms -- on GPU the interior kernel call is nonblocking
/////////////////////////////
st.CommunicateBegin(requests);
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif
/////////////////////////////
// Complete comms
/////////////////////////////
// std::cout << " TwoSpinWilsonFermion3plus1D Comms Complete " <<std::endl;
st.CommunicateComplete(requests);
// traceStop(id);
/////////////////////////////
// do the compute exterior
/////////////////////////////
{
// std::cout << " TwoSpinWilsonFermion3plus1D Comms Merge " <<std::endl;
GRID_TRACE("Merge");
st.CommsMerge(compressor);
}
// std::cout << " TwoSpinWilsonFermion3plus1D Exterior " <<std::endl;
if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
} else {
GRID_TRACE("DhopExterior");
Kernels::DhopKernel (st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
}
// std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DoubledGaugeField & U,
const FermionField &in,
FermionField &out,int dag)
{
GRID_TRACE("DhopInternalSerialComms");
Compressor compressor(dag);
int LLs = in.Grid()->_rdimensions[0];
// std::cout << " TwoSpinWilsonFermion3plus1D Halo exch " <<std::endl;
{
GRID_TRACE("HaloExchange");
st.HaloExchangeOpt(in,compressor);
}
// std::cout << " TwoSpinWilsonFermion3plus1D Dhop " <<std::endl;
if (dag == DaggerYes) {
GRID_TRACE("DhopDag");
Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
} else {
GRID_TRACE("Dhop");
Kernels::DhopKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
}
// std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
conformable(in.Grid(),out.Grid()); // drops the cb check
assert(in.Checkerboard()==Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven,UmuOdd,in,out,dag);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
conformable(in.Grid(),out.Grid()); // drops the cb check
assert(in.Checkerboard()==Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd,UmuEven,in,out,dag);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopComms(const FermionField &in, FermionField &out)
{
int dag =0 ;
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
Compressor compressor(dag);
Stencil.HaloExchangeOpt(in,compressor);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
{
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
int LLs = in.Grid()->_rdimensions[0];
Kernels::DhopKernel(Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil,Umu,in,out,dag);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
{
out.Checkerboard()=in.Checkerboard();
Dhop(in,out,dag); // -0.5 is included
axpy(out,Nd*1.0-M5,in,out);
}
template <class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(Nd*1.0 + M5);
out = scal * in;
}
template <class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
out = (1.0/(Nd*1.0 + M5))*in;
}
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in,out);
}
NAMESPACE_END(Grid);


@@ -1,441 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonKernels.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
int perm= SE->_permute; \
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
spProj(chi,tmp); \
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
int perm= SE->_permute; \
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
spProj(chi,tmp); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
} \
acceleratorSynchronise();
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (!SE->_is_local ) { \
auto chi = coalescedRead(buf[SE->_offset],lane); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
nmu++; \
} \
acceleratorSynchronise();
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
if (SE->_is_local ) { \
int perm= SE->_permute; \
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
spProj(chi,tmp); \
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
Recon(result, Uchi);
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
if (gamma == Dir) { \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \
}
////////////////////////////////////////////////////////////////////
// All legs kernels ; comms then compute
////////////////////////////////////////////////////////////////////
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::DhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
calcSpinor chi;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
GENERIC_STENCIL_LEG(Xp,pauliProjXp,pauliAssign);
GENERIC_STENCIL_LEG(Yp,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG(Zp,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG(Xm,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG(Ym,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG(Zm,pauliProjZm,pauliAdd);
coalescedWrite(out[sF],result,lane);
};
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
calcSpinor chi;
// calcSpinor *chi_p;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
GENERIC_STENCIL_LEG(Xm,pauliProjXp,pauliAssign);
GENERIC_STENCIL_LEG(Ym,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG(Zm,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG(Xp,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG(Yp,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG(Zp,pauliProjZm,pauliAdd);
coalescedWrite(out[sF], result,lane);
};
////////////////////////////////////////////////////////////////////
// Interior kernels
////////////////////////////////////////////////////////////////////
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
calcSpinor chi;
// calcSpinor *chi_p;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_INT(Xp,pauliProjXp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Yp,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Zp,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Xm,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG_INT(Ym,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG_INT(Zm,pauliProjZm,pauliAdd);
coalescedWrite(out[sF], result,lane);
};
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
calcSpinor chi;
// calcSpinor *chi_p;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
result=Zero();
GENERIC_STENCIL_LEG_INT(Xm,pauliProjXp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Ym,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Zm,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Xp,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG_INT(Yp,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG_INT(Zp,pauliProjZm,pauliAdd);
coalescedWrite(out[sF], result,lane);
};
////////////////////////////////////////////////////////////////////
// Exterior kernels
////////////////////////////////////////////////////////////////////
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
// calcSpinor *chi_p;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
int nmu=0;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZm,pauliAdd);
if ( nmu ) {
auto out_t = coalescedRead(out[sF],lane);
out_t = out_t + result;
coalescedWrite(out[sF],out_t,lane);
}
};
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
// calcSpinor *chi_p;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
int nmu=0;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZm,pauliAdd);
if ( nmu ) {
auto out_t = coalescedRead(out[sF],lane);
out_t = out_t + result;
coalescedWrite(out[sF],out_t,lane);
}
};
#define DhopDirMacro(Dir,spProj,spRecon) \
template <class Impl> accelerator_inline \
void TwoSpinWilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF, \
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
{ \
typedef decltype(coalescedRead(in[0])) calcSpinor; \
calcSpinor chi; \
calcSpinor result; \
calcSpinor Uchi; \
StencilEntry *SE; \
int ptype; \
const int Nsimd = SiteSpinor::Nsimd(); \
const int lane=acceleratorSIMTlane(Nsimd); \
\
SE = st.GetEntry(ptype, dir, sF); \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
coalescedWrite(out[sF], result,lane); \
}
DhopDirMacro(Xp,pauliProjXp,pauliAssign);
DhopDirMacro(Yp,pauliProjYp,pauliAssign);
DhopDirMacro(Zp,pauliProjZp,pauliAssign);
DhopDirMacro(Xm,pauliProjXm,pauliAssign);
DhopDirMacro(Ym,pauliProjYm,pauliAssign);
DhopDirMacro(Zm,pauliProjZm,pauliAssign);
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
calcSpinor chi;
calcSpinor result;
calcSpinor Uchi;
StencilEntry *SE;
int ptype;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
SE = st.GetEntry(ptype, dir, sF);
GENERIC_DHOPDIR_LEG(Xp,pauliProjXp,pauliAssign);
GENERIC_DHOPDIR_LEG(Yp,pauliProjYp,pauliAssign);
GENERIC_DHOPDIR_LEG(Zp,pauliProjZp,pauliAssign);
GENERIC_DHOPDIR_LEG(Xm,pauliProjXm,pauliAssign);
GENERIC_DHOPDIR_LEG(Ym,pauliProjYm,pauliAssign);
GENERIC_DHOPDIR_LEG(Zm,pauliProjZm,pauliAssign);
coalescedWrite(out[sF], result,lane);
}
template <class Impl>
void TwoSpinWilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out)
{
autoView(U_v ,U,AcceleratorRead);
autoView(in_v ,in,AcceleratorRead);
autoView(st_v ,st,AcceleratorRead);
autoView(out_Xm,out[0],AcceleratorWrite);
autoView(out_Ym,out[1],AcceleratorWrite);
autoView(out_Zm,out[2],AcceleratorWrite);
autoView(out_Xp,out[3],AcceleratorWrite); // forward legs fill slots 3..5 of the six-entry output, matching the dir arguments below
autoView(out_Yp,out[4],AcceleratorWrite);
autoView(out_Zp,out[5],AcceleratorWrite);
auto CBp=st.CommBuf();
accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls;
int sF =sss;
DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,3);
DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,4);
DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,5);
});
}
template <class Impl>
void TwoSpinWilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
{
assert(dirdisp<=5);
assert(dirdisp>=0);
autoView(U_v ,U ,AcceleratorRead);
autoView(in_v ,in ,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v ,st ,AcceleratorRead);
auto CBp=st.CommBuf();
#define LoopBody(Dir) \
case Dir : \
accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
for(int s=0;s<Ls;s++){ \
int sU=ss; \
int sF = s+Ls*sU; \
DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
} \
}); \
break;
switch(gamma){
LoopBody(Xp);
LoopBody(Yp);
LoopBody(Zp);
LoopBody(Xm);
LoopBody(Ym);
LoopBody(Zm);
default:
assert(0);
break;
}
#undef LoopBody
}
#define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
});
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
#define KERNEL_CALL_EXT(A) \
const uint64_t sz = st.surface_list.size(); \
auto ptr = &st.surface_list[0]; \
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
int sF = ptr[ss]; \
int sU = sF/Ls; \
TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
}); \
accelerator_barrier();
template <class Impl>
void TwoSpinWilsonKernels<Impl>::DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior)
{
autoView(U_v , U,AcceleratorRead);
autoView(in_v , in,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
acceleratorFenceComputeStream();
KERNEL_CALL(GenericDhopSite);
return;
} else if( interior ) {
KERNEL_CALLNB(GenericDhopSiteInt);
return;
} else if( exterior ) {
// Dependent on result of merge
acceleratorFenceComputeStream();
KERNEL_CALL_EXT(GenericDhopSiteExt);
return;
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
void TwoSpinWilsonKernels<Impl>::DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior)
{
autoView(U_v ,U,AcceleratorRead);
autoView(in_v ,in,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v ,st,AcceleratorRead);
if( interior && exterior ) {
acceleratorFenceComputeStream();
KERNEL_CALL(GenericDhopSiteDag);
return;
} else if( interior ) {
KERNEL_CALLNB(GenericDhopSiteDagInt); return;
} else if( exterior ) {
// Dependent on result of merge
acceleratorFenceComputeStream();
KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;
}
assert(0 && " Kernel optimisation case not covered ");
}
#undef KERNEL_CALLNB
#undef KERNEL_CALL
NAMESPACE_END(Grid);
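The interior/exterior split in DhopKernel and DhopDagKernel above is what lets a caller overlap halo communication with bulk compute. A rough caller-side sketch of that pattern follows; it assumes the stencil exposes the same gather/communicate/merge entry points used by Grid's Wilson overlapped-comms drivers (HaloExchangeOptGather, CommunicateBegin/CommunicateComplete, CommsMerge), so treat the names and ordering as illustrative rather than the exact TwoSpinWilson driver.
// Illustrative only: overlap comms with the interior Dhop, then apply the
// exterior (surface) update once the halos have arrived.
std::vector<std::vector<CommsRequest_t> > requests;
st.HaloExchangeOptGather(in, compressor);      // pack faces into comm buffers (assumed API)
st.CommunicateBegin(requests);                 // start MPI / intranode traffic
Kernels::DhopKernel(st, U, st.CommBuf(), Ls, Nsite, in, out, 1, 0); // interior sites
st.CommunicateComplete(requests);              // wait for remote faces
st.CommsMerge(compressor);                     // unpack/merge received faces
Kernels::DhopKernel(st, U, st.CommBuf(), Ls, Nsite, in, out, 0, 1); // exterior sites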

View File

@@ -61,7 +61,7 @@ WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0); diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
} else { } else {
csw_r = _csw_r * 0.5; csw_r = _csw_r * 0.5;
diag_mass = Nd*1.0 + _mass; diag_mass = 4.0 + _mass;
} }
csw_t = _csw_t * 0.5; csw_t = _csw_t * 0.5;
@@ -297,9 +297,9 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
{ {
if (mu == nu) if (mu == nu)
continue; continue;
RealD factor; RealD factor;
if (nu == (Nd-1) || mu == (Nd-1)) // This was a bug - surely mu/nu is NEVER 4 but rather (Nd-1)=3 ?? if (nu == 4 || mu == 4)
{ {
factor = 2.0 * csw_t; factor = 2.0 * csw_t;
} }
@@ -307,11 +307,9 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
{ {
factor = 2.0 * csw_r; factor = 2.0 * csw_r;
} }
if ( mu < Nd && nu < Nd ) { // Allow to restrict range to Nd=3, but preserve orders of SigmaMuNu in table by counting ALL PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
}
count++; count++;
} }

View File

@@ -63,10 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
Dirichlet(0) Dirichlet(0)
{ {
// some assertions // some assertions
assert(FiveDimGrid._ndimension==Nd+1); assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==Nd); assert(FourDimGrid._ndimension==4);
assert(FourDimRedBlackGrid._ndimension==Nd); assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==Nd+1); assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
// extent of fifth dim and not spread out // extent of fifth dim and not spread out
@@ -76,7 +76,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
assert(FiveDimRedBlackGrid._processors[0] ==1); assert(FiveDimRedBlackGrid._processors[0] ==1);
// Other dimensions must match the decomposition of the four-D fields // Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<Nd;d++){ for(int d=0;d<4;d++){
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
@@ -93,13 +93,11 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
if ( p.dirichlet.size() == Nd+1) { if ( p.dirichlet.size() == Nd+1) {
Coordinate block = p.dirichlet; Coordinate block = p.dirichlet;
for(int d=0;d<Nd+1;d++) { if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
if ( block[d] ){ Dirichlet = 1;
Dirichlet = 1; std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl; std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl; Block = block;
Block = block;
}
} }
} else { } else {
Coordinate block(Nd+1,0); Coordinate block(Nd+1,0);
@@ -114,7 +112,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
assert(FiveDimGrid._simd_layout[0] ==nsimd); assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
for(int d=0;d<Nd;d++){ for(int d=0;d<4;d++){
assert(FourDimGrid._simd_layout[d]==1); assert(FourDimGrid._simd_layout[d]==1);
assert(FourDimRedBlackGrid._simd_layout[d]==1); assert(FourDimRedBlackGrid._simd_layout[d]==1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
@@ -185,8 +183,8 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t; // assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
int skip = (disp==1) ? 0 : 1; int skip = (disp==1) ? 0 : 1;
int dirdisp = dir+skip*Nd; int dirdisp = dir+skip*4;
int gamma = dir+(1-skip)*Nd; int gamma = dir+(1-skip)*4;
Compressor compressor(DaggerNo); Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
@@ -485,7 +483,7 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
{ {
out.Checkerboard()=in.Checkerboard(); out.Checkerboard()=in.Checkerboard();
Dhop(in,out,dag); // -0.5 is included Dhop(in,out,dag); // -0.5 is included
axpy(out,Nd*1.0-M5,in,out); axpy(out,4.0-M5,in,out);
} }
template <class Impl> template <class Impl>
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
@@ -511,7 +509,7 @@ template <class Impl>
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{ {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(Nd*1.0 + M5); typename FermionField::scalar_type scal(4.0 + M5);
out = scal * in; out = scal * in;
} }
@@ -526,7 +524,7 @@ template<class Impl>
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{ {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
out = (1.0/(Nd*1.0 + M5))*in; out = (1.0/(4.0 + M5))*in;
} }
template<class Impl> template<class Impl>
@@ -637,7 +635,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0); A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
F = eaLs * (one - Wea + (Wema - one) * mass*mass); F = eaLs * (one - Wea + (Wema - one) * mass*mass);
F = F + emaLs * (Wema - one + (one - Wea) * mass*mass); F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
F = F - abs(W) * sinha * (Nd* 1.0) * mass; F = F - abs(W) * sinha * 4.0 * mass;
Bpp = (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one); Bpp = (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
Bmm = (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one); Bmm = (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one);

View File

@@ -63,7 +63,7 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
if (anisotropyCoeff.isAnisotropic){ if (anisotropyCoeff.isAnisotropic){
diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0); diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
} else { } else {
diag_mass = Nd*1.0 + mass; diag_mass = 4.0 + mass;
} }
int vol4; int vol4;
@@ -354,8 +354,8 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
Stencil.HaloExchange(in, compressor); Stencil.HaloExchange(in, compressor);
int skip = (disp == 1) ? 0 : 1; int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * Nd; int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * Nd; int gamma = dir + (1 - skip) * 4;
DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
}; };
@@ -370,8 +370,8 @@ void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<Fermion
for(int disp=-1;disp<=1;disp+=2){ for(int disp=-1;disp<=1;disp+=2){
int skip = (disp == 1) ? 0 : 1; int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * Nd; int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * Nd; int gamma = dir + (1 - skip) * 4;
DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo); DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
} }

View File

@@ -97,7 +97,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
distance = st._distances[DIR]; \ distance = st._distances[DIR]; \
sl = st._simd_layout[direction]; \ sl = st._simd_layout[direction]; \
inplace_twist = 0; \ inplace_twist = 0; \
if(SE->_around_the_world && st.parameters.twists[DIR % Nd]){ \ if(SE->_around_the_world && st.parameters.twists[DIR % 4]){ \
if(sl == 1){ \ if(sl == 1){ \
g = (F+1) % 2; \ g = (F+1) % 2; \
}else{ \ }else{ \

View File

@@ -32,30 +32,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> ImprovedStaggeredFermion5DStatic::directions(ImprovedStaggeredFermion5DStatic::MakeDirections()); const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements(ImprovedStaggeredFermion5DStatic::MakeDisplacements()); const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDirections(void)
{
std::vector<int> directions(4*Nd);
for(int d=0;d<Nd;d++){
directions[d+Nd*0] = d+1;
directions[d+Nd*1] = d+1;
directions[d+Nd*2] = d+1;
directions[d+Nd*3] = d+1;
}
return directions;
}
std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDisplacements(void)
{
std::vector<int> displacements(4*Nd);
for(int d=0;d<Nd;d++){
displacements[d+Nd*0] =+1;
displacements[d+Nd*1] =-1;
displacements[d+Nd*2] =+3;
displacements[d+Nd*3] =-3;
}
return displacements;
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -32,26 +32,5 @@ NAMESPACE_BEGIN(Grid);
const std::vector<int> ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}); const std::vector<int> ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3}); const std::vector<int> ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
std::vector<int> ImprovedStaggeredFermionStatic::MakeDirections(void)
{
std::vector<int> directions(4*Nd);
for(int d=0;d<Nd;d++){
directions[d+Nd*0] = d;
directions[d+Nd*1] = d;
directions[d+Nd*2] = d;
directions[d+Nd*3] = d;
}
return directions;
}
std::vector<int> ImprovedStaggeredFermionStatic::MakeDisplacements(void)
{
std::vector<int> displacements(4*Nd);
for(int d=0;d<Nd;d++){
displacements[d+Nd*0] =+1;
displacements[d+Nd*1] =-1;
displacements[d+Nd*2] =+3;
displacements[d+Nd*3] =-3;
}
return displacements;
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -30,27 +30,7 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
//const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3}); const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
//const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1}); const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
const std::vector<int> NaiveStaggeredFermionStatic::directions(NaiveStaggeredFermionStatic::MakeDirections());
const std::vector<int> NaiveStaggeredFermionStatic::displacements(NaiveStaggeredFermionStatic::MakeDisplacements());
std::vector<int> NaiveStaggeredFermionStatic::MakeDirections(void)
{
std::vector<int> directions(4*Nd);
for(int d=0;d<Nd;d++){
directions[d+Nd*0] = d;
directions[d+Nd*1] = d;
}
return directions;
}
std::vector<int> NaiveStaggeredFermionStatic::MakeDisplacements(void)
{
std::vector<int> displacements(4*Nd);
for(int d=0;d<Nd;d++){
displacements[d+Nd*0] =+1;
displacements[d+Nd*1] =-1;
}
return displacements;
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -1,61 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
NAMESPACE_BEGIN(Grid);
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::directions (TwoSpinWilsonFermion3plus1DStatic::MakeDirections());
const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::displacements(TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements());
std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDirections (void)
{
std::vector<int> directions(2*Nd);
for(int d=0;d<Nd;d++){
directions[d] = d+1;
directions[d+Nd] = d+1;
}
return directions;
}
std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements(void)
{
std::vector<int> displacements(2*Nd);
for(int d=0;d<Nd;d++){
displacements[d] = +1;
displacements[d+Nd] = -1;
}
return displacements;
}
NAMESPACE_END(Grid);

View File

@@ -1,40 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class TwoSpinWilsonFermion3plus1D<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@@ -1,40 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class TwoSpinWilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@@ -34,28 +34,8 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::directions (WilsonFermion5DStatic::MakeDirections()); const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
const std::vector<int> WilsonFermion5DStatic::displacements(WilsonFermion5DStatic::MakeDisplacements());
std::vector<int> WilsonFermion5DStatic::MakeDirections (void)
{
std::vector<int> directions(2*Nd);
for(int d=0;d<Nd;d++){
directions[d] = d+1;
directions[d+Nd] = d+1;
}
return directions;
}
std::vector<int> WilsonFermion5DStatic::MakeDisplacements(void)
{
std::vector<int> displacements(2*Nd);
for(int d=0;d<Nd;d++){
displacements[d] = +1;
displacements[d+Nd] = -1;
}
return displacements;
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -33,27 +33,9 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
const std::vector<int> WilsonFermionStatic::directions(WilsonFermionStatic::MakeDirections()); const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> WilsonFermionStatic::displacements(WilsonFermionStatic::MakeDisplacements()); const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
int WilsonFermionStatic::HandOptDslash; int WilsonFermionStatic::HandOptDslash;
std::vector<int> WilsonFermionStatic::MakeDirections (void)
{
std::vector<int> directions(2*Nd);
for(int d=0;d<Nd;d++){
directions[d] = d;
directions[d+Nd] = d;
}
return directions;
}
std::vector<int> WilsonFermionStatic::MakeDisplacements(void)
{
std::vector<int> displacements(2*Nd);
for(int d=0;d<Nd;d++){
displacements[d] = +1;
displacements[d+Nd] = -1;
}
return displacements;
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -36,16 +36,11 @@ DWF_IMPL_LIST=" \
ZWilsonImplF \ ZWilsonImplF \
ZWilsonImplD2 " ZWilsonImplD2 "
TWOSPIN_WILSON_IMPL_LIST=" \
TwoSpinWilsonImplF \
TwoSpinWilsonImplD "
GDWF_IMPL_LIST=" \ GDWF_IMPL_LIST=" \
GparityWilsonImplF \ GparityWilsonImplF \
GparityWilsonImplD " GparityWilsonImplD "
IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST $TWOSPIN_WILSON_IMPL_LIST" IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST"
for impl in $IMPL_LIST for impl in $IMPL_LIST
do do
@@ -115,12 +110,7 @@ do
done done
done done
CC_LIST="TwoSpinWilsonFermion3plus1DInstantiation.cc.master TwoSpinWilsonKernelsInstantiation.cc.master" CC_LIST=" \
ImprovedStaggeredFermion5DInstantiation \
StaggeredKernelsInstantiation "
for impl in $TWOSPIN_WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done

View File

@@ -158,8 +158,8 @@ RealD WilsonFlowBase<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeF
LatticeComplexD R(U.Grid()); LatticeComplexD R(U.Grid());
R = Zero(); R = Zero();
for(int mu=0;mu<Nd-1;mu++){ for(int mu=0;mu<3;mu++){
for(int nu=mu+1;nu<Nd;nu++){ for(int nu=mu+1;nu<4;nu++){
WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu); WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
R = R + trace(F*F); R = R + trace(F*F);
} }

View File

@@ -1,220 +0,0 @@
#ifndef GRID_QCD_PAULI_H
#define GRID_QCD_PAULI_H
#include <array>
NAMESPACE_BEGIN(Grid);
//
/*
* Pauli basis
* sx sy sz ident
* (0 1) , (0 -i) , ( 1 0 )
* (1 0) (i 0) ( 0 -1)
*
* These are hermitian.
*
* Also supply wilson "projectors" (1+/-sx), (1+/-sy), (1+/-sz)
*
* spPauliProjXm
* spPauliProjYm etc...
*/
class Pauli {
public:
GRID_SERIALIZABLE_ENUM(Algebra, undef,
SigmaX , 0,
MinusSigmaX , 1,
SigmaY , 2,
MinusSigmaY , 3,
SigmaZ , 4,
MinusSigmaZ , 5,
Identity , 6,
MinusIdentity , 7);
static constexpr unsigned int nPauli = 8;
static const std::array<const char *, nPauli> name;
static const std::array<std::array<Algebra, nPauli>, nPauli> mul;
static const std::array<Algebra, nPauli> adj;
static const std::array<const Pauli, 4> gmu;
static const std::array<const Pauli, 16> gall;
Algebra g;
public:
accelerator Pauli(Algebra initg): g(initg) {}
};
#define CopyImplementation(iTemplate,multPauli,multFlavour) \
template<class vtype> \
accelerator_inline void multPauli(iTemplate<vtype, Nhs> &ret, const iTemplate<vtype, Nhs> &rhs) { \
multFlavour(ret,rhs); \
}
CopyImplementation(iVector,multPauliSigmaX,multFlavourSigmaX);
CopyImplementation(iMatrix,lmultPauliSigmaX,lmultFlavourSigmaX);
CopyImplementation(iMatrix,rmultPauliSigmaX,rmultFlavourSigmaX);
CopyImplementation(iVector,multPauliMinusSigmaX ,multFlavourMinusSigmaX);
CopyImplementation(iMatrix,lmultPauliMinusSigmaX,lmultFlavourMinusSigmaX);
CopyImplementation(iMatrix,rmultPauliMinusSigmaX,rmultFlavourMinusSigmaX);
CopyImplementation(iVector,multPauliSigmaY,multFlavourSigmaY);
CopyImplementation(iMatrix,lmultPauliSigmaY,lmultFlavourSigmaY);
CopyImplementation(iMatrix,rmultPauliSigmaY,rmultFlavourSigmaY);
CopyImplementation(iVector,multPauliMinusSigmaY ,multFlavourMinusSigmaY);
CopyImplementation(iMatrix,lmultPauliMinusSigmaY,lmultFlavourMinusSigmaY);
CopyImplementation(iMatrix,rmultPauliMinusSigmaY,rmultFlavourMinusSigmaY);
CopyImplementation(iVector,multPauliSigmaZ,multFlavourSigmaZ);
CopyImplementation(iMatrix,lmultPauliSigmaZ,lmultFlavourSigmaZ);
CopyImplementation(iMatrix,rmultPauliSigmaZ,rmultFlavourSigmaZ);
CopyImplementation(iVector,multPauliMinusSigmaZ ,multFlavourMinusSigmaZ);
CopyImplementation(iMatrix,lmultPauliMinusSigmaZ,lmultFlavourMinusSigmaZ);
CopyImplementation(iMatrix,rmultPauliMinusSigmaZ,rmultFlavourMinusSigmaZ);
CopyImplementation(iVector,multPauliIdentity,multFlavourIdentity);
CopyImplementation(iMatrix,lmultPauliIdentity,lmultFlavourIdentity);
CopyImplementation(iMatrix,rmultPauliIdentity,rmultFlavourIdentity);
CopyImplementation(iVector,multPauliMinusIdentity ,multFlavourMinusIdentity);
CopyImplementation(iMatrix,lmultPauliMinusIdentity,lmultFlavourMinusIdentity);
CopyImplementation(iMatrix,rmultPauliMinusIdentity,rmultFlavourMinusIdentity);
/*
* sx sy sz ident
* (0 1) , (0 -i) , ( 1 0 )
* (1 0) (i 0) ( 0 -1)
*/
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)+fspin(1);
hspin(1)=fspin(1)+fspin(0);
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)-fspin(1);
hspin(1)=fspin(1)-fspin(0);
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)-timesI(fspin(1));
hspin(1)=fspin(1)+timesI(fspin(0));
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)+timesI(fspin(1));
hspin(1)=fspin(1)-timesI(fspin(0));
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)+fspin(0);
hspin(1)=Zero();
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=Zero();
hspin(1)=fspin(1)+fspin(1);
}
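Written out in the convention of the header comment above (a consistency check, not additional code), the projectors implemented by pauliProjXp/Yp/Zp act as
\[
(1+\sigma_x)\begin{pmatrix}\psi_0\\ \psi_1\end{pmatrix}=\begin{pmatrix}\psi_0+\psi_1\\ \psi_1+\psi_0\end{pmatrix},\qquad
(1+\sigma_y)\begin{pmatrix}\psi_0\\ \psi_1\end{pmatrix}=\begin{pmatrix}\psi_0-i\psi_1\\ \psi_1+i\psi_0\end{pmatrix},\qquad
(1+\sigma_z)\begin{pmatrix}\psi_0\\ \psi_1\end{pmatrix}=\begin{pmatrix}2\psi_0\\ 0\end{pmatrix},
\]
while the Xm/Ym/Zm variants use \((1-\sigma)\) instead, flipping the sign of the sigma term in each component.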
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAssign(iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
{
fspin = hspin;
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAdd (iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
{
fspin = fspin + hspin;
}
template<class vtype>
accelerator_inline auto operator*(const Pauli &G, const iVector<vtype, Nhs> &arg)
->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Nhs>, PauliIndex>::value, iVector<vtype, Nhs>>::type
{
iVector<vtype, Nhs> ret;
switch (G.g)
{
case Pauli::Algebra::SigmaX:
multPauliSigmaX(ret, arg); break;
case Pauli::Algebra::MinusSigmaX:
multPauliMinusSigmaX(ret, arg); break;
case Pauli::Algebra::SigmaY:
multPauliSigmaY(ret, arg); break;
case Pauli::Algebra::MinusSigmaY:
multPauliMinusSigmaY(ret, arg); break;
case Pauli::Algebra::SigmaZ:
multPauliSigmaZ(ret, arg); break;
case Pauli::Algebra::MinusSigmaZ:
multPauliMinusSigmaZ(ret, arg); break;
case Pauli::Algebra::Identity:
multPauliIdentity(ret, arg); break;
case Pauli::Algebra::MinusIdentity:
multPauliMinusIdentity(ret, arg); break;
default: assert(0);
}
return ret;
}
template<class vtype>
accelerator_inline auto operator*(const Pauli &G, const iMatrix<vtype, Nhs> &arg)
->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
{
iMatrix<vtype, Nhs> ret;
switch (G.g)
{
case Pauli::Algebra::SigmaX:
lmultPauliSigmaX(ret, arg); break;
case Pauli::Algebra::MinusSigmaX:
lmultPauliMinusSigmaX(ret, arg); break;
case Pauli::Algebra::SigmaY:
lmultPauliSigmaY(ret, arg); break;
case Pauli::Algebra::MinusSigmaY:
lmultPauliMinusSigmaY(ret, arg); break;
case Pauli::Algebra::SigmaZ:
lmultPauliSigmaZ(ret, arg); break;
case Pauli::Algebra::MinusSigmaZ:
lmultPauliMinusSigmaZ(ret, arg); break;
case Pauli::Algebra::Identity:
lmultPauliIdentity(ret, arg); break;
case Pauli::Algebra::MinusIdentity:
lmultPauliMinusIdentity(ret, arg); break;
default: assert(0);
}
return ret;
}
template<class vtype>
accelerator_inline auto operator*(const iMatrix<vtype, Nhs> &arg, const Pauli &G)
->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
{
iMatrix<vtype, Nhs> ret;
switch (G.g)
{
case Pauli::Algebra::SigmaX:
rmultPauliSigmaX(ret, arg); break;
case Pauli::Algebra::MinusSigmaX:
rmultPauliMinusSigmaX(ret, arg); break;
case Pauli::Algebra::SigmaY:
rmultPauliSigmaY(ret, arg); break;
case Pauli::Algebra::MinusSigmaY:
rmultPauliMinusSigmaY(ret, arg); break;
case Pauli::Algebra::SigmaZ:
rmultPauliSigmaZ(ret, arg); break;
case Pauli::Algebra::MinusSigmaZ:
rmultPauliMinusSigmaZ(ret, arg); break;
case Pauli::Algebra::Identity:
rmultPauliIdentity(ret, arg); break;
case Pauli::Algebra::MinusIdentity:
rmultPauliMinusIdentity(ret, arg); break;
default: assert(0);
}
return ret;
}
NAMESPACE_END(Grid);
#endif // GRID_QCD_GAMMA_H

View File

@@ -179,17 +179,20 @@ public:
////////////////////////////////////////////////// //////////////////////////////////////////////////
// average over all x,y,z the temporal loop // average over all x,y,z the temporal loop
////////////////////////////////////////////////// //////////////////////////////////////////////////
static ComplexD avgPolyakovLoop(const GaugeField &Umu) { static ComplexD avgPolyakovLoop(const GaugeField &Umu) { //assume Nd=4
GaugeMat Ut(Umu.Grid()), P(Umu.Grid()); GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
ComplexD out; ComplexD out;
uint64_t vol = Umu.Grid()->gSites(); int T = Umu.Grid()->GlobalDimensions()[3];
int T = Umu.Grid()->GlobalDimensions()[Nd-1]; int X = Umu.Grid()->GlobalDimensions()[0];
Ut = peekLorentz(Umu,Nd-1); //Select temporal direction int Y = Umu.Grid()->GlobalDimensions()[1];
int Z = Umu.Grid()->GlobalDimensions()[2];
Ut = peekLorentz(Umu,3); //Select temporal direction
P = Ut; P = Ut;
for (int t=1;t<T;t++){ for (int t=1;t<T;t++){
P = Gimpl::CovShiftForward(Ut,Nd-1,P); P = Gimpl::CovShiftForward(Ut,3,P);
} }
RealD norm = 1.0/(Nc*vol); RealD norm = 1.0/(Nc*X*Y*Z*T);
out = sum(trace(P))*norm; out = sum(trace(P))*norm;
return out; return out;
} }
@@ -212,7 +215,7 @@ public:
double vol = Umu.Grid()->gSites(); double vol = Umu.Grid()->gSites();
return p.real() / vol / (Nd * Nc ) ; return p.real() / vol / (4.0 * Nc ) ;
}; };
////////////////////////////////////////////////// //////////////////////////////////////////////////
@@ -737,7 +740,6 @@ public:
//cf https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 6 //cf https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 6
//output is the charge by timeslice: sum over timeslices to obtain the total //output is the charge by timeslice: sum over timeslices to obtain the total
static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){ static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
// Audit: 4D epsilon is hard coded
assert(Nd == 4); assert(Nd == 4);
std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr)); std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
//Note F_numu = - F_munu //Note F_numu = - F_munu
@@ -827,25 +829,6 @@ public:
return out; return out;
} }
//Compute the 5Li topological charge density
static std::vector<Real> TopologicalChargeDensity5Li(const GaugeLorentz &U){
static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };
std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);
double c5=1./20.;
double c4=1./5.-2.*c5;
double c3=(-64.+640.*c5)/45.;
double c2=(1-64.*c5)/9.;
double c1=(19.-55.*c5)/9.;
int Lt = loops[0].size();
std::vector<Real> out(Lt,0.);
for(int t=0;t<Lt;t++)
out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
return out;
}
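For reference, the removed routine computes the improved 5Li density as the linear combination (assuming loops[i] follows the (M,N) order listed in exts)
\[
q_{5Li}(t) = c_1\, q_{1\times1}(t) + c_2\, q_{2\times2}(t) + c_3\, q_{1\times2}(t) + c_4\, q_{1\times3}(t) + c_5\, q_{3\times3}(t),
\]
with \(c_5=\tfrac{1}{20}\), \(c_4=\tfrac{1}{5}-2c_5\), \(c_3=\tfrac{-64+640c_5}{45}\), \(c_2=\tfrac{1-64c_5}{9}\), \(c_1=\tfrac{19-55c_5}{9}\), as coded above.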
static Real TopologicalCharge5Li(const GaugeLorentz &U){ static Real TopologicalCharge5Li(const GaugeLorentz &U){
std::vector<Real> Qt = TimesliceTopologicalCharge5Li(U); std::vector<Real> Qt = TimesliceTopologicalCharge5Li(U);
Real Q = 0.; Real Q = 0.;
@@ -1472,7 +1455,7 @@ public:
////////////////////////////////////////////////// //////////////////////////////////////////////////
static Real sumWilsonLoop(const GaugeLorentz &Umu, static Real sumWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) { const int R1, const int R2) {
std::vector<GaugeMat> U(Nd, Umu.Grid()); std::vector<GaugeMat> U(4, Umu.Grid());
for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) { for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu); U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1491,7 +1474,7 @@ public:
////////////////////////////////////////////////// //////////////////////////////////////////////////
static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu, static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) { const int R1, const int R2) {
std::vector<GaugeMat> U(Nd, Umu.Grid()); std::vector<GaugeMat> U(4, Umu.Grid());
for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) { for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu); U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1509,8 +1492,8 @@ public:
// sum over all x,y,z,t and over all planes of spatial Wilson loop // sum over all x,y,z,t and over all planes of spatial Wilson loop
////////////////////////////////////////////////// //////////////////////////////////////////////////
static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu, static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) { const int R1, const int R2) {
std::vector<GaugeMat> U(Nd, Umu.Grid()); std::vector<GaugeMat> U(4, Umu.Grid());
for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) { for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu); U[mu] = PeekIndex<LorentzIndex>(Umu, mu);

View File

@@ -252,7 +252,7 @@ inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){ inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
int nn=vComplexD::Nsimd(); int nn=vComplexD::Nsimd();
std::vector<ComplexD> buf(nn); std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
vstore(o,&buf[0]); vstore(o,&buf[0]);
stream<<"<"; stream<<"<";
for(int i=0;i<nn;i++){ for(int i=0;i<nn;i++){
@@ -272,7 +272,7 @@ inline std::ostream& operator<< (std::ostream& stream, const vComplexD2 &o){
inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){ inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
int nn=vRealF::Nsimd(); int nn=vRealF::Nsimd();
std::vector<RealF> buf(nn); std::vector<RealF,alignedAllocator<RealF> > buf(nn);
vstore(o,&buf[0]); vstore(o,&buf[0]);
stream<<"<"; stream<<"<";
for(int i=0;i<nn;i++){ for(int i=0;i<nn;i++){

File diff suppressed because it is too large

View File

@@ -30,25 +30,26 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
uint64_t DslashFullCount; uint64_t DslashFullCount;
uint64_t DslashPartialCount; //uint64_t DslashPartialCount;
uint64_t DslashDirichletCount; uint64_t DslashDirichletCount;
void DslashResetCounts(void) void DslashResetCounts(void)
{ {
DslashFullCount=0; DslashFullCount=0;
DslashPartialCount=0; // DslashPartialCount=0;
DslashDirichletCount=0; DslashDirichletCount=0;
} }
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full) void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
{ {
dirichlet = DslashDirichletCount; dirichlet = DslashDirichletCount;
partial = DslashPartialCount; partial = 0;
full = DslashFullCount; full = DslashFullCount;
} }
void DslashLogFull(void) { DslashFullCount++;} void DslashLogFull(void) { DslashFullCount++;}
void DslashLogPartial(void) { DslashPartialCount++;} //void DslashLogPartial(void) { DslashPartialCount++;}
void DslashLogDirichlet(void){ DslashDirichletCount++;} void DslashLogDirichlet(void){ DslashDirichletCount++;}
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table) int off,std::vector<std::pair<int,int> > & table)

View File

@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation // These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams { struct DefaultImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet; // int partialDirichlet;
DefaultImplParams() { DefaultImplParams() {
dirichlet.resize(0); dirichlet.resize(0);
partialDirichlet=0; // partialDirichlet=0;
}; };
}; };
@@ -69,6 +69,12 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table); int off,std::vector<std::pair<int,int> > & table);
class StencilBuffer
{
public:
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
};
void DslashResetCounts(void); void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void); void DslashLogFull(void);
@@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// If true, this is partially communicated per face // If true, this is partially communicated per face
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
StencilVector _comms_partial_send; // StencilVector _comms_partial_send;
StencilVector _comms_partial_recv; // StencilVector _comms_partial_recv;
// //
StencilVector _comm_buf_size; StencilVector _comm_buf_size;
StencilVector _permute_type; StencilVector _permute_type;
@@ -205,16 +211,16 @@ public:
struct Packet { struct Packet {
void * send_buf; void * send_buf;
void * recv_buf; void * recv_buf;
#ifndef ACCELERATOR_AWARE_MPI void * compressed_send_buf;
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE void * compressed_recv_buf;
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
#endif
Integer to_rank; Integer to_rank;
Integer from_rank; Integer from_rank;
Integer do_send; Integer do_send;
Integer do_recv; Integer do_recv;
Integer xbytes; Integer xbytes;
Integer rbytes; Integer rbytes;
Integer xbytes_compressed;
Integer rbytes_compressed;
}; };
struct Merge { struct Merge {
static constexpr int Nsimd = vobj::Nsimd(); static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +229,7 @@ public:
std::vector<cobj *> vpointers; std::vector<cobj *> vpointers;
Integer buffer_size; Integer buffer_size;
Integer type; Integer type;
Integer partial; // partial dirichlet BCs // Integer partial; // partial dirichlet BCs
Coordinate dims; Coordinate dims;
}; };
struct Decompress { struct Decompress {
@@ -231,7 +237,7 @@ public:
cobj * kernel_p; cobj * kernel_p;
cobj * mpi_p; cobj * mpi_p;
Integer buffer_size; Integer buffer_size;
Integer partial; // partial dirichlet BCs // Integer partial; // partial dirichlet BCs
Coordinate dims; Coordinate dims;
}; };
struct CopyReceiveBuffer { struct CopyReceiveBuffer {
@@ -252,9 +258,45 @@ public:
protected: protected:
GridBase * _grid; GridBase * _grid;
///////////////////////////////////////////////////
// Sloppy comms will make a second buffer upon comms
///////////////////////////////////////////////////
size_t device_heap_top; //
size_t device_heap_bytes;//
size_t device_heap_size; //
void *DeviceBufferMalloc(size_t bytes)
{
void *ptr = (void *)device_heap_top;
device_heap_top += bytes;
device_heap_bytes+= bytes;
if ( device_heap_bytes > device_heap_size ) {
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
assert (device_heap_bytes <= device_heap_size);
}
return ptr;
}
void DeviceBufferFreeAll(void)
{
device_heap_size = _unified_buffer_size*sizeof(cobj);
// Resize up if necessary, never down
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
}
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
device_heap_size = StencilBuffer::DeviceCommBuf.size();
device_heap_bytes=0;
}
public: public:
GridBase *Grid(void) const { return _grid; } GridBase *Grid(void) const { return _grid; }
/////////////////////////////////////////////////////////
// Control reduced precision comms
/////////////////////////////////////////////////////////
int SloppyComms;
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
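A plausible usage sketch follows (the names Dw, CG, HermOp, src and psi are assumptions, as is reaching the setter through the operator's public Stencil members); the exact user-facing hook provided by the reduced-precision comms interface may differ.
// Hypothetical: enable half-width halo exchange for a sloppy solve, then restore.
Dw.Stencil.SetSloppyComms(1);
Dw.StencilEven.SetSloppyComms(1);
Dw.StencilOdd.SetSloppyComms(1);
CG(HermOp, src, psi);               // Dhop halos now move at half the bytes
Dw.Stencil.SetSloppyComms(0);
Dw.StencilEven.SetSloppyComms(0);
Dw.StencilOdd.SetSloppyComms(0);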
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Needed to conveniently communicate gparity parameters into GPU memory // Needed to conveniently communicate gparity parameters into GPU memory
// without adding parameters. Perhaps a template parameter to StenciView is // without adding parameters. Perhaps a template parameter to StenciView is
@@ -268,7 +310,7 @@ public:
} }
int face_table_computed; int face_table_computed;
int partialDirichlet; // int partialDirichlet;
int fullDirichlet; int fullDirichlet;
std::vector<deviceVector<std::pair<int,int> > > face_table ; std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list; deviceVector<int> surface_list;
@@ -361,24 +403,145 @@ public:
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel. // Non blocking send and receive. Necessarily parallel.
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void DecompressPacket(Packet &packet)
{
if ( !SloppyComms ) return;
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.rbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if(sizeof(word)==8) {
// Can either choose to represent as float vs double and prec change
// OR
// truncate the mantissa bfp16 style
double *dbuf =(double *) packet.recv_buf;
float *fbuf =(float *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
});
} else if ( sizeof(word)==4){
// Can either choose to represent as half vs float and prec change
// OR
// truncate the mantissa bfp16 style
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
});
} else {
assert(0 && "unknown floating point precision");
}
}
}
void CompressPacket(Packet &packet)
{
packet.xbytes_compressed = packet.xbytes;
packet.compressed_send_buf = packet.send_buf;
packet.rbytes_compressed = packet.rbytes;
packet.compressed_recv_buf = packet.recv_buf;
if ( !SloppyComms ) {
return;
}
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.xbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
packet.rbytes_compressed = packet.rbytes/2;
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
}
//else {
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
// }
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
packet.xbytes_compressed = packet.xbytes/2;
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
if(sizeof(word)==8) {
double *dbuf =(double *) packet.send_buf;
float *fbuf =(float *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
});
} else if ( sizeof(word)==4){
uint32_t *fbuf =(uint32_t *) packet.send_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
});
} else {
assert(0 && "unknown floating point precision");
}
}
// else {
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
// }
return;
}
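A minimal scalar sketch of the conversions that CompressPacket and DecompressPacket perform on device (fp64 payloads are sent as fp32; fp32 payloads are truncated bfloat16-style, keeping the sign, the 8-bit exponent and the top 7 mantissa bits):
#include <cstdint>
#include <cstring>
static inline uint16_t fp32_to_bf16(float f) {  // send side (sizeof(word)==4 path)
  uint32_t u; std::memcpy(&u, &f, sizeof(u));
  return (uint16_t)(u >> 16);                   // drop the low 16 mantissa bits
}
static inline float bf16_to_fp32(uint16_t h) {  // receive side
  uint32_t u = ((uint32_t)h) << 16;             // pad discarded bits with zeroes
  float f; std::memcpy(&f, &u, sizeof(f));
  return f;
}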
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin"); FlightRecorder::StepLog("Communicate begin");
///////////////////////////////////////////////
// All GPU kernel tasks must complete // All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete // accelerator_barrier(); All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer //Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too. _grid->StencilBarrier();
// But the HaloGather had a barrier too.
///////////////////////////////////////////////
if (SloppyComms) {
DeviceBufferFreeAll();
}
for(int i=0;i<Packets.size();i++){
this->CompressPacket(Packets[i]);
}
if (SloppyComms) {
accelerator_barrier();
#ifdef NVLINK_GET
_grid->StencilBarrier();
#endif
}
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl; // std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs, _grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf, Packets[i].compressed_send_buf,
Packets[i].to_rank,Packets[i].do_send, Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf, Packets[i].compressed_recv_buf,
Packets[i].from_rank,Packets[i].do_recv, Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i); Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
} }
// std::cout << "Communicate PollDtoH "<<std::endl; // std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
@@ -389,19 +552,22 @@ public:
// Starts intranode // Starts intranode
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl; // std::cout << "Communicate Begin "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf, Packets[i].send_buf,Packets[i].compressed_send_buf,
Packets[i].to_rank,Packets[i].do_send, Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf, Packets[i].recv_buf,Packets[i].compressed_recv_buf,
Packets[i].from_rank,Packets[i].do_recv, Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i); Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
// std::cout << "Communicate Begin started "<<i<<std::endl;
// _grid->Barrier();
} }
FlightRecorder::StepLog("Communicate begin has finished"); FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums // Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!) // Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_send ) if ( Packets[i].do_send )
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes); FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
} }
} }
@@ -416,14 +582,15 @@ public:
// std::cout << "Communicate Complete Complete "<<std::endl; // std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial(); // if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet(); if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull(); else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier(); // accelerator_barrier();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
this->DecompressPacket(Packets[i]);
if ( Packets[i].do_recv ) if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
} }
FlightRecorder::StepLog("Finish communicate complete"); FlightRecorder::StepLog("Finish communicate complete");
} }
@@ -618,7 +785,7 @@ public:
} }
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) { void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
Decompress d; Decompress d;
d.partial = this->partialDirichlet; // d.partial = this->partialDirichlet;
d.dims = _grid->_fdimensions; d.dims = _grid->_fdimensions;
d.kernel_p = k_p; d.kernel_p = k_p;
d.mpi_p = m_p; d.mpi_p = m_p;
@@ -627,7 +794,7 @@ public:
} }
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) { void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m; Merge m;
m.partial = this->partialDirichlet; // m.partial = this->partialDirichlet;
m.dims = _grid->_fdimensions; m.dims = _grid->_fdimensions;
m.type = type; m.type = type;
m.mpointer = merge_p; m.mpointer = merge_p;
@@ -732,8 +899,8 @@ public:
int block = dirichlet_block[dimension]; int block = dirichlet_block[dimension];
this->_comms_send[ii] = comm_dim; this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim; this->_comms_recv[ii] = comm_dim;
this->_comms_partial_send[ii] = 0; // this->_comms_partial_send[ii] = 0;
this->_comms_partial_recv[ii] = 0; // this->_comms_partial_recv[ii] = 0;
if ( block && comm_dim ) { if ( block && comm_dim ) {
assert(abs(displacement) < ld ); assert(abs(displacement) < ld );
// Quiesce communication across block boundaries // Quiesce communication across block boundaries
@@ -754,10 +921,10 @@ public:
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0; if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0; if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
} }
if ( partialDirichlet ) { // if ( partialDirichlet ) {
this->_comms_partial_send[ii] = !this->_comms_send[ii]; // this->_comms_partial_send[ii] = !this->_comms_send[ii];
this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; // this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
} // }
} }
} }
} }
@@ -769,6 +936,7 @@ public:
Parameters p=Parameters(), Parameters p=Parameters(),
bool preserve_shm=false) bool preserve_shm=false)
{ {
SloppyComms = 0;
face_table_computed=0; face_table_computed=0;
_grid = grid; _grid = grid;
this->parameters=p; this->parameters=p;
@@ -786,7 +954,7 @@ public:
this->same_node.resize(npoints); this->same_node.resize(npoints);
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
partialDirichlet = p.partialDirichlet; // partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up DirichletBlock(p.dirichlet); // comms send/recv set up
fullDirichlet=0; fullDirichlet=0;
for(int d=0;d<p.dirichlet.size();d++){ for(int d=0;d<p.dirichlet.size();d++){
@@ -867,7 +1035,7 @@ public:
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
// Allow for multiple stencils to exist simultaneously // Allow for multiple stencils to be communicated simultaneously
if (!preserve_shm) if (!preserve_shm)
_grid->ShmBufferFreeAll(); _grid->ShmBufferFreeAll();
@@ -935,7 +1103,8 @@ public:
GridBase *grid=_grid; GridBase *grid=_grid;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; // int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
int comms_recv = this->_comms_recv[point];
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension]; int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension]; int rd = _grid->_rdimensions[dimension];
@@ -1124,8 +1293,8 @@ public:
int comms_send = this->_comms_send[point]; int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point]; int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ; // int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ; // int comms_partial_recv = this->_comms_partial_recv[point] ;
assert(rhs.Grid()==_grid); assert(rhs.Grid()==_grid);
// conformable(_grid,rhs.Grid()); // conformable(_grid,rhs.Grid());
@@ -1160,11 +1329,11 @@ public:
int rbytes; int rbytes;
if ( comms_send ) xbytes = bytes; // Full send if ( comms_send ) xbytes = bytes; // Full send
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); // else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; // full dirichlet else xbytes = 0; // full dirichlet
if ( comms_recv ) rbytes = bytes; if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); // else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0; else rbytes = 0;
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1191,7 +1360,8 @@ public:
} }
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) { // if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
if ( compress.DecompressionStep()&&comms_recv) {
recv_buf=u_simd_recv_buf[0]; recv_buf=u_simd_recv_buf[0];
} else { } else {
recv_buf=this->u_recv_buf_p; recv_buf=this->u_recv_buf_p;
@@ -1225,7 +1395,8 @@ public:
#endif #endif
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl; // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send); // compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask); int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
if ( !duplicate ) { // Force comms for now if ( !duplicate ) { // Force comms for now
@@ -1234,8 +1405,8 @@ public:
// Build a list of things to do after we synchronise GPUs // Build a list of things to do after we synchronise GPUs
// Start comms now??? // Start comms now???
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
int do_send = (comms_send|comms_partial_send) && (!shm_send ); int do_send = (comms_send) && (!shm_send );
int do_recv = (comms_send|comms_partial_send) && (!shm_recv ); int do_recv = (comms_send) && (!shm_recv );
AddPacket((void *)&send_buf[comm_off], AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off], (void *)&recv_buf[comm_off],
xmit_to_rank, do_send, xmit_to_rank, do_send,
@@ -1243,7 +1414,7 @@ public:
xbytes,rbytes); xbytes,rbytes);
} }
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) { if ( (compress.DecompressionStep() && comms_recv) ) {
AddDecompress(&this->u_recv_buf_p[comm_off], AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off], &recv_buf[comm_off],
words,Decompressions); words,Decompressions);
@@ -1265,8 +1436,8 @@ public:
int comms_send = this->_comms_send[point]; int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point]; int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ; // int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ; // int comms_partial_recv = this->_comms_partial_recv[point] ;
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension]; int rd = _grid->_rdimensions[dimension];
@@ -1341,18 +1512,20 @@ public:
if ( comms_send ) xbytes = bytes; if ( comms_send ) xbytes = bytes;
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); // else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; else xbytes = 0;
if ( comms_recv ) rbytes = bytes; if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); // else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0; else rbytes = 0;
// Gathers SIMD lanes for send and merge // Gathers SIMD lanes for send and merge
// Different faces can be full comms or partial comms with multiple ranks per node // Different faces can be full comms or partial comms with multiple ranks per node
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) { // if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
if ( comms_send || comms_recv ) {
int partial = partialDirichlet; // int partial = partialDirichlet;
int partial = 0;
compressor::Gather_plane_exchange(face_table[face_idx],rhs, compressor::Gather_plane_exchange(face_table[face_idx],rhs,
spointers,dimension,sx,cbmask, spointers,dimension,sx,cbmask,
compress,permute_type,partial ); compress,permute_type,partial );
@@ -1418,7 +1591,8 @@ public:
if ( (bytes != rbytes) && (rbytes!=0) ){ if ( (bytes != rbytes) && (rbytes!=0) ){
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
} }
int do_send = (comms_send|comms_partial_send) && (!shm_send ); // int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_send = (comms_send) && (!shm_send );
AddPacket((void *)sp,(void *)rp, AddPacket((void *)sp,(void *)rp,
xmit_to_rank,do_send, xmit_to_rank,do_send,
recv_from_rank,do_send, recv_from_rank,do_send,
@@ -1432,7 +1606,8 @@ public:
} }
} }
// rpointer may be doing a remote read in the gather over SHM // rpointer may be doing a remote read in the gather over SHM
if ( comms_recv|comms_partial_recv ) { // if ( comms_recv|comms_partial_recv ) {
if ( comms_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
} }



@@ -67,7 +67,7 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name); printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld"); GPU_PROP_FMT(totalGlobalMem,"%zu");
GPU_PROP(managedMemory); GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard); GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize); GPU_PROP(warpSize);
@@ -240,7 +240,7 @@ void acceleratorInit(void)
char hostname[HOST_NAME_MAX+1]; char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1); gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = sycl::device::get_devices(); auto devices = sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){ for(int d = 0;d<devices.size();d++){


@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
auto err = cudaMallocHost((void **)&ptr,bytes); auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
assert(0); assert(0);
} }
return ptr; return ptr;
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
auto err = cudaMallocManaged((void **)&ptr,bytes); auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
assert(0); assert(0);
} }
return ptr; return ptr;
@@ -237,7 +237,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = cudaMalloc((void **)&ptr,bytes); auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
} }
return ptr; return ptr;
}; };


@@ -46,10 +46,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <cstdlib> #include <cstdlib>
#include <memory> #include <memory>
#include <Grid/Grid.h> #include <Grid/Grid.h>
#include <Grid/util/CompilerCompatible.h> #include <Grid/util/CompilerCompatible.h>
#ifdef HAVE_UNWIND
#include <libunwind.h>
#endif
#include <fenv.h> #include <fenv.h>
#ifdef __APPLE__ #ifdef __APPLE__
@@ -187,9 +191,8 @@ void GridParseLayout(char **argv,int argc,
Coordinate &latt_c, Coordinate &latt_c,
Coordinate &mpi_c) Coordinate &mpi_c)
{ {
auto mpi =std::vector<int>(Nd,1); auto mpi =std::vector<int>({1,1,1,1});
auto latt=std::vector<int>(Nd,8); auto latt=std::vector<int>({8,8,8,8});
GridThread::SetMaxThreads(); GridThread::SetMaxThreads();
@@ -229,9 +232,6 @@ void GridParseLayout(char **argv,int argc,
} }
// Copy back into coordinate format // Copy back into coordinate format
int nd = mpi.size(); int nd = mpi.size();
// std::cout << "mpi.size() "<<nd<<std::endl;
// std::cout << "latt.size() "<<latt.size()<<std::endl;
// std::cout << "Nd "<<Nd<<std::endl;
assert(latt.size()==nd); assert(latt.size()==nd);
latt_c.resize(nd); latt_c.resize(nd);
mpi_c.resize(nd); mpi_c.resize(nd);
@@ -299,6 +299,20 @@ void GridBanner(void)
std::cout << std::setprecision(9); std::cout << std::setprecision(9);
} }
//Some file local variables
static int fileno_stdout;
static int fileno_stderr;
static int signal_delay;
class dlRegion {
public:
uint64_t start;
uint64_t end;
uint64_t size;
uint64_t offset;
std::string name;
};
std::vector<dlRegion> dlMap;
void Grid_init(int *argc,char ***argv) void Grid_init(int *argc,char ***argv)
{ {
@@ -351,6 +365,19 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init(); Grid_debug_handler_init();
} }
// Sleep n-seconds at end of handler
if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
GridCmdOptionInt(arg,signal_delay);
}
// periodic wakeup with stack trace printed
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
Grid_debug_heartbeat();
}
// periodic wakeup with empty handler (interrupts some system calls)
if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
Grid_heartbeat();
}
#if defined(A64FX) #if defined(A64FX)
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
@@ -400,15 +427,25 @@ void Grid_init(int *argc,char ***argv)
fp=freopen(ename.str().c_str(),"w",stderr); fp=freopen(ename.str().c_str(),"w",stderr);
assert(fp!=(FILE *)NULL); assert(fp!=(FILE *)NULL);
} }
fileno_stdout = fileno(stdout);
fileno_stderr = fileno(stderr) ;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// OK to use GridLogMessage etc from here on // OK to use GridLogMessage etc from here on
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
std::cout << GridLogMessage << "================================================ "<<std::endl; std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl; std::cout << GridLogMessage << "================================================ "<<std::endl;
{
gethostname(hostname, HOST_NAME_MAX+1); gethostname(hostname, HOST_NAME_MAX+1);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl; time_t mytime;
struct tm *info;
char buffer[80];
time(&mytime);
info = localtime(&mytime);
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
}
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// Reporting // Reporting
@@ -425,6 +462,47 @@ void Grid_init(int *argc,char ***argv)
MemoryProfiler::stats = &dbgMemStats; MemoryProfiler::stats = &dbgMemStats;
} }
/////////////////////////////////////////////////////////
// LD.so space
/////////////////////////////////////////////////////////
#ifndef __APPLE__
{
// Provides mapping of .so files
FILE *f = fopen("/proc/self/maps", "r");
if (f) {
char line[256];
while (fgets(line, sizeof(line), f)) {
if (strstr(line, "r-xp")) {
dlRegion region;
uint32_t major, minor, inode;
uint64_t start,end,offset;
char path[PATH_MAX];
sscanf(line,"%lx-%lx r-xp %lx %x:%x %d %s",
&start,&end,&offset,
&major,&minor,&inode,path);
region.start=start;
region.end =end;
region.offset=offset;
region.name = std::string(path);
region.size = region.end-region.start;
dlMap.push_back(region);
// std::cout << GridLogMessage<< line;
}
}
fclose(f);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
for(int r=0;r<dlMap.size();r++){
auto region = dlMap[r];
std::cout << GridLogMessage<<" "<<region.name<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
}
std::cout << GridLogMessage << "================================================ "<<std::endl;
}
}
#endif
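For reference, a hedged sketch of what the dlMap collected above is for: given an instruction pointer from a backtrace, a simple range test turns it into "object + offset", which addr2line or llvm-symbolizer can map back to file and line. The Region struct here just mirrors dlRegion for illustration; nothing below is part of the patch:

  #include <cstdint>
  #include <cstdio>
  #include <string>
  #include <vector>

  struct Region { uint64_t start, end, offset; std::string name; };

  // Translate an absolute instruction pointer into "object+offset".
  std::string resolve(uint64_t ip, const std::vector<Region> &map)
  {
    for (const auto &r : map) {
      if (ip >= r.start && ip < r.end) {
        char off[32];
        snprintf(off, sizeof(off), "+0x%llx", (unsigned long long)(ip - r.start));
        return r.name + off;
      }
    }
    return "<unknown>";
  }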
//////////////////////////////////// ////////////////////////////////////
// Logging // Logging
//////////////////////////////////// ////////////////////////////////////
@@ -457,14 +535,19 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl; std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl; std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl; std::cout<<GridLogMessage<<"Verbose:"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl; std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl; std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<"Debug:"<<std::endl;
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Performance:"<<std::endl; std::cout<<GridLogMessage<<"Performance:"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
@@ -559,17 +642,56 @@ void GridLogLayout() {
} }
void * Grid_backtrace_buffer[_NBACKTRACE]; void * Grid_backtrace_buffer[_NBACKTRACE];
#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr) void sig_print_dig(uint32_t dig)
{ {
fprintf(stderr,"Signal handler on host %s\n",hostname); const char *digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
fprintf(stderr,"FlightRecorder step %d stage %s \n", if ( dig>=0 && dig< 16){
FlightRecorder::StepLoggingCounter, SIGLOG(digits[dig]);
FlightRecorder::StepName); }
fprintf(stderr,"Caught signal %d\n",si->si_signo); }
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); void sig_print_uint(uint32_t A)
fprintf(stderr," code %d\n",si->si_code); {
// x86 64bit int dig;
int nz=0;
#define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
DIGIT(1000000000); // Catches 4BN = 2^32
DIGIT(100000000);
DIGIT(10000000);
DIGIT(1000000);
DIGIT(100000);
DIGIT(10000);
DIGIT(1000);
DIGIT(100);
DIGIT(10);
DIGIT(1);
if (nz==0) SIGLOG("0");
}
void sig_print_hex(uint64_t A)
{
int nz=0;
int dig;
#define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
SIGLOG("0x");
NIBBLE((A>>(15*4))&0xF);
NIBBLE((A>>(14*4))&0xF);
NIBBLE((A>>(13*4))&0xF);
NIBBLE((A>>(12*4))&0xF);
NIBBLE((A>>(11*4))&0xF);
NIBBLE((A>>(10*4))&0xF);
NIBBLE((A>>(9*4))&0xF);
NIBBLE((A>>(8*4))&0xF);
NIBBLE((A>>(7*4))&0xF);
NIBBLE((A>>(6*4))&0xF);
NIBBLE((A>>(5*4))&0xF);
NIBBLE((A>>(4*4))&0xF);
NIBBLE((A>>(3*4))&0xF);
NIBBLE((A>>(2*4))&0xF);
NIBBLE((A>>4)&0xF);
sig_print_dig(A&0xF);
}
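Illustration (worked from the helpers defined above, not part of the patch): these routines emit plain ASCII through write() and avoid the printf family, which may allocate and is not async-signal-safe. Inside a handler they would produce, for example:

  sig_print_uint(305);   // writes "305"  : leading zeros suppressed by the nz flag
  sig_print_uint(0);     // writes "0"    : explicit zero fallback
  sig_print_hex(0x2a);   // writes "0x2a" : high nibbles skipped until the first non-zero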
/*
#ifdef __linux__ #ifdef __linux__
#ifdef __x86_64__ #ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr; ucontext_t * uc= (ucontext_t *)ptr;
@@ -577,80 +699,158 @@ void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#endif #endif
#endif #endif
fflush(stderr); */
BACKTRACEFP(stderr); void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
fprintf(stderr,"Called backtrace\n"); {
fflush(stdout); SIGLOG("Signal handler on host ");
fflush(stderr); SIGLOG(hostname);
SIGLOG(" process id ");
sig_print_uint((uint32_t)getpid());
SIGLOG("\n");
SIGLOG("FlightRecorder step ");
sig_print_uint(FlightRecorder::StepLoggingCounter);
SIGLOG(" stage ");
SIGLOG(FlightRecorder::StepName);
SIGLOG("\n");
SIGLOG("Caught signal ");
sig_print_uint(si->si_signo);
SIGLOG("\n");
SIGLOG(" mem address ");
sig_print_hex((uint64_t)si->si_addr);
SIGLOG("\n");
SIGLOG(" code ");
sig_print_uint(si->si_code);
SIGLOG("\n");
ucontext_t *uc= (ucontext_t *)ptr;
SIGLOG("Backtrace:\n");
#ifdef HAVE_UNWIND
// Debug cross check on offsets
// int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
// backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
unw_cursor_t cursor;
unw_word_t ip, off;
if (!unw_init_local(&cursor, uc) ) {
SIGLOG(" frame IP function\n");
int level = 0;
int ret = 0;
while(1) {
char name[128];
if (level >= _NBACKTRACE) return;
unw_get_reg(&cursor, UNW_REG_IP, &ip);
sig_print_uint(level); SIGLOG(" ");
sig_print_hex(ip); SIGLOG(" ");
for(int r=0;r<dlMap.size();r++){
if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
SIGLOG(dlMap[r].name.c_str());
SIGLOG("+");
sig_print_hex((ip-dlMap[r].start));
break;
}
}
SIGLOG("\n");
Grid_backtrace_buffer[level]=(void *)ip;
level++;
ret = unw_step(&cursor);
if (ret <= 0) {
return;
}
}
}
#else
// Known Asynch-Signal unsafe
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
#endif
}
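For reference, a minimal stand-alone sketch of the same libunwind walk outside a signal handler, assuming libunwind is available. It captures its own context with unw_getcontext rather than using the ucontext_t delivered to a handler, and it can afford printf because it does not run in signal context:

  #define UNW_LOCAL_ONLY
  #include <libunwind.h>
  #include <cstdio>

  void print_backtrace(void)
  {
    unw_context_t ctx;
    unw_cursor_t  cursor;
    unw_getcontext(&ctx);            // capture current register state
    unw_init_local(&cursor, &ctx);   // in-process unwinding only
    while (unw_step(&cursor) > 0) {  // walk the caller frames
      unw_word_t ip = 0, off = 0;
      char sym[256] = "??";
      unw_get_reg(&cursor, UNW_REG_IP, &ip);
      unw_get_proc_name(&cursor, sym, sizeof(sym), &off); // may fail for stripped frames
      printf("  %p : %s+0x%lx\n", (void *)ip, sym, (unsigned long)off);
    }
  }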
void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
SIGLOG("\n");
}
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
if (signal_delay) {
SIGLOG("Adding extra signal delay ");
sig_print_uint(signal_delay);
SIGLOG(" s\n");
usleep( (uint64_t) signal_delay*1000LL*1000LL);
}
SIGLOG("\n");
return; return;
} }
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
fprintf(stderr,"Signal handler on host %s\n",hostname); Grid_generic_handler(sig,si,ptr);
fprintf(stderr,"Caught signal %d\n",si->si_signo); SIGLOG("\n");
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// Linux/Posix
#ifdef __linux__
// And x86 64bit
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
REG(rbx);
REG(rdx);
REG(rax);
REG(rcx);
REG(rsp);
REG(rip);
REG(r8);
REG(r9);
REG(r10);
REG(r11);
REG(r12);
REG(r13);
REG(r14);
REG(r15);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
exit(0); exit(0);
return; return;
}; };
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
{
// SIGLOG("heartbeat signal handled\n");
return;
}
void Grid_debug_heartbeat(void)
{
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGALRM,&sa_ping,NULL);
// repeating 10s heartbeat
struct itimerval it_val;
it_val.it_value.tv_sec = 10;
it_val.it_value.tv_usec = 0;
it_val.it_interval = it_val.it_value;
setitimer(ITIMER_REAL, &it_val, NULL);
}
void Grid_heartbeat(void)
{
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_empty_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGALRM,&sa_ping,NULL);
// repeating 10s heartbeat
struct itimerval it_val;
it_val.it_value.tv_sec = 10;
it_val.it_value.tv_usec = 1000;
it_val.it_interval = it_val.it_value;
setitimer(ITIMER_REAL, &it_val, NULL);
}
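A note on the two variants above: Grid_debug_heartbeat attaches the backtracing user handler to SIGALRM so each tick reports where the rank is, while Grid_heartbeat installs an empty handler purely so the periodic signal interrupts blocked system calls (SA_RESTART is not set, so a stuck call returns EINTR). A minimal stand-alone sketch of that second effect, with illustrative names only:

  #include <cerrno>
  #include <csignal>
  #include <cstdio>
  #include <sys/time.h>
  #include <unistd.h>

  static void empty_handler(int, siginfo_t *, void *) {}  // wake-up only

  int main(void)
  {
    struct sigaction sa = {};
    sigemptyset(&sa.sa_mask);
    sa.sa_sigaction = empty_handler;
    sa.sa_flags = SA_SIGINFO;               // note: no SA_RESTART
    sigaction(SIGALRM, &sa, NULL);

    struct itimerval it = {};
    it.it_value.tv_sec = 10;                // first tick after 10 s
    it.it_interval = it.it_value;           // then every 10 s
    setitimer(ITIMER_REAL, &it, NULL);

    char buf[16];
    ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); // blocks here
    if (n < 0 && errno == EINTR)
      printf("read() interrupted by the heartbeat\n"); // stuck call was woken
    return 0;
  }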
void Grid_exit_handler(void) void Grid_exit_handler(void)
{ {
// BACKTRACEFP(stdout); BACKTRACEFP(stdout);
// fflush(stdout); fflush(stdout);
} }
void Grid_debug_handler_init(void) void Grid_debug_handler_init(void)
{ {
struct sigaction sa; struct sigaction sa;
sigemptyset (&sa.sa_mask); sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_sa_signal_handler; sa.sa_sigaction= Grid_fatal_signal_handler;
sa.sa_flags = SA_SIGINFO; sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL); sigaction(SIGTRAP,&sa,NULL);
// sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL);
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
// sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL); sigaction(SIGILL,&sa,NULL);
#ifndef GRID_SYCL
sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
sigaction(SIGBUS,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
#endif
// Non terminating SIGUSR1/2 handler // Non terminating SIGHUP handler
struct sigaction sa_ping; struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask); sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler; sa_ping.sa_sigaction= Grid_usr_signal_handler;


@@ -38,7 +38,11 @@ char * GridHostname(void);
// internal, controled with --handle // internal, controled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void); void Grid_debug_handler_init(void);
void Grid_debug_heartbeat(void);
void Grid_heartbeat(void);
void Grid_quiesce_nodes(void); void Grid_quiesce_nodes(void);
void Grid_unquiesce_nodes(void); void Grid_unquiesce_nodes(void);


@@ -201,8 +201,7 @@ int main(int argc, char **argv) {
Params.dirichlet=NonDirichlet; Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet; ParamsDir.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=0; // ParamsDir.partialDirichlet=0;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14; // double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9; // double MDStoppingCondition = 1e-9;
@@ -298,11 +297,11 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet; else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; // if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0; // else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; // if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0; // else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));


@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
ParamsF.dirichlet=NonDirichlet; ParamsF.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet; ParamsDir.dirichlet=Dirichlet;
ParamsDirF.dirichlet=Dirichlet; ParamsDirF.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1; // ParamsDir.partialDirichlet=1;
ParamsDirF.partialDirichlet=1; // ParamsDirF.partialDirichlet=1;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl; // std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14; // double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9; // double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet; else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; // if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0; // else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; // if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0; // else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet; ParamsDenF.dirichlet = ParamsDen.dirichlet;
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet; // ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF)); DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet; ParamsNumF.dirichlet = ParamsNum.dirichlet;
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet; // ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF)); NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h])); LinOpD.push_back(new LinearOperatorD(*Denominators[h]));

TODO

@@ -1,8 +1,3 @@
* Clean up the extract merge and replace with insertLane/extractLane
-----
i) Refine subspace with HDCG & recompute i) Refine subspace with HDCG & recompute
ii) Block Lanczos in coarse space ii) Block Lanczos in coarse space
iii) Batched block project in the operator computation iii) Batched block project in the operator computation


@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
} }
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header(); header();
for(int lat=8;lat<=maxlat;lat+=4){ for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){ for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0], Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1], lat*mpi_layout[1],
lat*mpi_layout[2], lat*mpi_layout[2],
lat*mpi_layout[3]}); lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors; RealD Nrank = Grid._Nprocessors;
@@ -193,101 +193,6 @@ int main (int argc, char ** argv)
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes); rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
} }
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
dbytes=0;
ncomm=0;
std::vector<CommsRequest_t> requests;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
}
}
Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
// double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm; int ncomm;
double dbytes; double dbytes;
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
@@ -296,45 +201,34 @@ int main (int argc, char ** argv)
std::vector<CommsRequest_t> requests; std::vector<CommsRequest_t> requests;
dbytes=0; dbytes=0;
ncomm=0; ncomm=0;
for(int mu=0;mu<4;mu++){
for(int dir=0;dir<8;dir++) {
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) { if (mpi_layout[mu]>1 ) {
ncomm++; ncomm++;
int comm_proc=1;
int xmit_to_rank; int xmit_to_rank;
int recv_from_rank; int recv_from_rank;
if ( dir == mu ) {
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); int comm_proc=1;
dbytes+= Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.StencilSendToRecvFromBegin(requests, } else {
(void *)&xbuf[mu][0], int comm_proc = mpi_layout[mu]-1;
xmit_to_rank,1, Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
(void *)&rbuf[mu][0], }
recv_from_rank,1, int tid = omp_get_thread_num();
bytes,bytes,mu); tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
Grid.StencilSendToRecvFromComplete(requests,mu); (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
requests.resize(0);
comm_proc = mpi_layout[mu]-1; dbytes+=tbytes;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
} }
} }
Grid.Barrier(); Grid.Barrier();
double stop=usecond(); double stop=usecond();
t_time[i] = stop-start; // microseconds t_time[i] = stop-start; // microseconds
} }
timestat.statistics(t_time); timestat.statistics(t_time);


@@ -32,18 +32,18 @@
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
template<class d> ////////////////////////
struct scal { /// Move to domains ////
d internal; ////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
}; };
Gamma::Algebra Gmu [] = { void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@@ -52,39 +52,108 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt(); int Ls=16;
int Ls=8; for(int i=0;i<argc;i++) {
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){ if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls; std::stringstream ss(argv[i+1]); ss >> Ls;
} }
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout(); GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc); long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); std::vector<int> seeds5({5,6,7,8});
#undef SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermion src (FGrid); random(RNG5,src);
FermionField src (FGrid); random(RNG5,src);
#if 0 #if 0
src = Zero(); src = Zero();
{ {
@@ -100,46 +169,39 @@ int main (int argc, char ** argv)
src = src*N2; src = src*N2;
#endif #endif
FermionField result(FGrid); result=Zero();
LatticeFermion result(FGrid); result=Zero(); FermionField ref(FGrid); ref=Zero();
LatticeFermion ref(FGrid); ref=Zero(); FermionField tmp(FGrid);
LatticeFermion tmp(FGrid); FermionField err(FGrid);
LatticeFermion err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl; std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeField Umu(UGrid); GaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl; std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrix ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
//////////////////////////////////// ////////////////////////////////////
// Naive wilson implementation // Naive wilson implementation
//////////////////////////////////// ////////////////////////////////////
// replicate across fifth dimension std::vector<ColourMatrixField> U(4,UGrid);
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu); U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
} }
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1) if (1)
@@ -147,10 +209,28 @@ int main (int argc, char ** argv)
ref = Zero(); ref = Zero();
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1); tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp; ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src; {
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1); tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp; ref=ref + tmp + Gamma(Gmu[mu])*tmp;
} }
@@ -167,11 +247,9 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl; std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl; std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl; std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -181,9 +259,15 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl; if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); FermionAction::ImplParams p;
int ncall =1000; p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;
RealD n2e;
if (1) { if (1) {
FGrid->Barrier(); FGrid->Barrier();
Dw.Dhop(src,result,0); Dw.Dhop(src,result,0);
@@ -198,8 +282,8 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall; double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd(); auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(vComplex); auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@@ -208,28 +292,27 @@ int main (int argc, char ** argv)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl; std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; n2e = norm2(err);
//exit(0); std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( norm2(err)>1.0e-4) ) { if(( n2e>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl; std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier(); FGrid->Barrier();
std::cout<<GridLogMessage << "RESULT" << std::endl;
// std::cout << result<<std::endl;
std::cout << norm2(result)<<std::endl;
std::cout<<GridLogMessage << "REF" << std::endl;
std::cout << norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "ERR" << std::endl;
std::cout << norm2(err)<<std::endl;
FGrid->Barrier();
exit(-1); exit(-1);
} }
assert (norm2(err)< 1.0e-4 ); assert (n2e< 1.0e-4 );
} }
if (1) if (1)
@@ -238,16 +321,30 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x // ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1); tmp = Cshift(src,mu+1,1);
{ {
autoView( ref_v, ref, CpuWrite); autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead); autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){ autoView( U_v , U[mu] , CpuRead);
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ; for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
} }
} }
tmp =adj(U[mu])*src; {
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1); tmp =Cshift(tmp,mu+1,-1);
{ {
autoView( ref_v, ref, CpuWrite); autoView( ref_v, ref, CpuWrite);
@@ -259,27 +356,27 @@ int main (int argc, char ** argv)
} }
ref = -0.5*ref; ref = -0.5*ref;
} }
// dump=1;
Dw.Dhop(src,result,1); Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl; std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl; std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl; n2e= norm2(err);
if((norm2(err)>1.0e-4)){ std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl; std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src); pickCheckerboard(Even,src_e,src);
@@ -291,10 +388,8 @@ int main (int argc, char ** argv)
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
+std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
-std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
-if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -308,13 +403,7 @@ int main (int argc, char ** argv)
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
-#ifdef CUDA_PROFILE
-if(i==10) cudaProfilerStart();
-#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
-#ifdef CUDA_PROFILE
-if(i==20) cudaProfilerStop();
-#endif
}
double t1=usecond();
FGrid->Barrier();
@@ -338,14 +427,9 @@ int main (int argc, char ** argv)
setCheckerboard(r_eo,r_e);
err = r_eo-result;
-std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
+n2e= norm2(err);
-if((norm2(err)>1.0e-4)){
+std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
-/*
+assert(n2e<1.0e-4);
-std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
-std::cout<< "Deo REF\n " <<result << std::endl;
-std::cout<< "Deo ERR \n " << err <<std::endl;
-*/
-}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
@@ -354,6 +438,4 @@ int main (int argc, char ** argv)
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
-Grid_finalize();
-exit(0);
}


@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaT
};
-void Benchmark(int Ls, Coordinate Dirichlet);
+void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
int main (int argc, char ** argv)
{
@@ -69,11 +69,19 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-Benchmark(Ls,Dirichlet);
+Benchmark(Ls,Dirichlet,false);
+std::cout << "\n\n\n\n\n\n" <<std::endl;
+std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
+std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
+/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
@@ -81,42 +89,35 @@ int main (int argc, char ** argv)
GlobalSharedMemory::GetShmDims(mpi,shm);
-//////////////////////
-// Node level
-//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
+// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-// Dirichlet[0] = 0;
+Dirichlet[0] = 0;
-// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
-Benchmark(Ls,Dirichlet);
+Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
+std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-// Dirichlet[0] = 0;
-// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
-Benchmark(Ls,Dirichlet);
+Benchmark(Ls,Dirichlet,true);
+*/
Grid_finalize();
exit(0);
}
-void Benchmark(int Ls, Coordinate Dirichlet)
+void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
@@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
-#endif
+#else
-#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
-#ifdef DOUBLE2
-typedef vComplexD2 Simd;
-typedef LatticeFermionD2 FermionField;
-typedef LatticeGaugeFieldD2 GaugeField;
-typedef LatticeColourMatrixD2 ColourMatrixField;
-typedef DomainWallFermionD2 FermionAction;
-#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
+Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;


@@ -1,465 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
for(auto partial : {0}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
//Coordinate shm({2,1,1,1});
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
//////////////////////
// Node level
//////////////////////
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid);
GaugeField UmuFull(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
UmuFull=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
if(!partial) Filter.applyFilter(UmuCopy);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid);
std::vector<ColourMatrixField> Ucopy(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
p.partialDirichlet=partial;
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
int ncall =1;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
DumpSliceNorm("s-slice ref ",ref,1);
DumpSliceNorm("s-slice res ",result,1);
DumpSliceNorm("s-slice error ",err,1);
exit(-1);
}
assert (n2e< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
}


@@ -86,6 +86,7 @@ AC_ARG_WITH([gmp],
[try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])],
@@ -106,6 +107,13 @@ AC_ARG_WITH([lime],
[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
+############### LIBUNWIND
+AC_ARG_WITH([unwind],
+[AS_HELP_STRING([--with-unwind=prefix],
+[try this for a non-standard install prefix of the libunwind library])],
+[AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
+[AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])
############### OpenSSL
AC_ARG_WITH([openssl],
[AS_HELP_STRING([--with-openssl=prefix],
@@ -198,8 +206,6 @@ AC_ARG_ENABLE([Nc],
[ac_Nc=${enable_Nc}], [ac_Nc=3])
case ${ac_Nc} in
-1)
-AC_DEFINE([Config_Nc],[1],[Gauge group Nc]);;
2)
AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
3)
@@ -213,21 +219,6 @@ case ${ac_Nc} in
*)
AC_MSG_ERROR(["Unsupport gauge group choice Nc = ${ac_Nc}"]);;
esac
-############### Nd
-AC_ARG_ENABLE([Nd],
-[AS_HELP_STRING([--enable-Nd=2|3|4],[enable default LGT dimension])],
-[ac_Nd=${enable_Nd}], [ac_Nd=4])
-case ${ac_Nd} in
-2)
-AC_DEFINE([Config_Nd],[2],[Gauge field dimension Nd]);;
-3)
-AC_DEFINE([Config_Nd],[3],[Gauge field dimension Nd]);;
-4)
-AC_DEFINE([Config_Nd],[4],[Gauge field dimension Nd]);;
-*)
-AC_MSG_ERROR(["Unsupport dimension Nd = ${ac_Nd}"]);;
-esac
############### Symplectic group
AC_ARG_ENABLE([Sp],
@@ -390,6 +381,16 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
[have_lime=true],
[AC_MSG_WARN(LIME library was not found in your system.)])
+AC_SEARCH_LIBS([unw_backtrace], [unwind],
+[AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
+[have_unwind=true],
+[AC_MSG_WARN(libunwind library was not found in your system.)])
+AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
+[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
+[have_unwind_x86_64=true],
+[AC_MSG_WARN(libunwind library was not found in your system.)])
AC_SEARCH_LIBS([SHA256_Init], [crypto],
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
[have_crypto=true],
@@ -835,7 +836,6 @@ os (target) : $target_os
compiler vendor : ${ax_cv_cxx_compiler_vendor}
compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS -----------------------------------
-Nd : ${ac_Nd}
Nc : ${ac_Nc}
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}
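A minimal sketch of driving the new option at build time (the /opt/libunwind prefix below is an illustrative assumption, not taken from the diff): --with-unwind simply prepends -I<prefix>/include and -L<prefix>/lib as the macro above shows, and the AC_SEARCH_LIBS probes then define HAVE_UNWIND / HAVE_UNWIND_X86_64 when the libraries are found.

#!/bin/sh
# Illustrative configure invocation; adjust the prefix to wherever libunwind is installed.
UNWIND_PREFIX=/opt/libunwind
./configure --with-unwind=$UNWIND_PREFIX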


@@ -0,0 +1,273 @@
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 4
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.309000 s : Testing with full communication
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.313000 s : Grid Layout
Grid : Message : 0.313000 s : Global lattice size : 32 32 64 64
Grid : Message : 0.319000 s : OpenMP threads : 4
Grid : Message : 0.320000 s : MPI tasks : 1 1 2 2
Grid : Message : 0.129590 s : Initialising 4d RNG
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.942440 s : Initialising 5d RNG
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 1 device 0 bus id: 0019:01:00.0
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
Grid : Message : 43.893114 s : Drawing gauge field
Grid : Message : 54.574150 s : Random gauge initialised
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 54.580032 s : Setting up Cshift based reference
Grid : Message : 60.407451 s : *****************************************************************
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 60.407470 s : *****************************************************************
Grid : Message : 60.407471 s : *****************************************************************
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 60.407473 s : * Vectorising space-time by 8
Grid : Message : 60.407475 s : * VComplex size is 64 B
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 60.407480 s : *****************************************************************
Grid : Message : 61.102178 s : Called warmup
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
Grid : Message : 62.177198 s : mflop/s = 24721998.6
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 71.328862 s : ----------------------------------------------------------------
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 71.328885 s : ----------------------------------------------------------------
Grid : Message : 71.328886 s : Called DwDag
Grid : Message : 71.328887 s : norm dag result 4.12810493
Grid : Message : 71.329493 s : norm dag ref 4.12810493
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 71.803650 s : src_e0.500003185
Grid : Message : 71.819727 s : src_o0.499996882
Grid : Message : 71.821991 s : *********************************************************
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 71.821995 s : * Vectorising space-time by 8
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.822003 s : *********************************************************
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
Grid : Message : 72.624877 s : r_e2.06377678
Grid : Message : 72.625198 s : r_o2.06381058
Grid : Message : 72.625507 s : res4.12758736
Grid : Message : 73.759140 s : norm diff 0
Grid : Message : 73.868204 s : norm diff even 0
Grid : Message : 73.907201 s : norm diff odd 0
Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads : 4
Grid : Message : 74.414595 s : MPI tasks : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0
Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads : 4
Grid : Message : 149.961004 s : MPI tasks : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************


@@ -0,0 +1,286 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads : 4
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0
Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads : 4
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0
Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads : 4
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************


@@ -0,0 +1,57 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > dwf.1node.perf
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.1node.perf


@@ -0,0 +1,57 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.4node.perf


@@ -0,0 +1,16 @@
export CXX=nvcc
export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
export LDFLAGS="-cudart shared -L${OPENMPI}/lib"
export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$CLIME \
--disable-gparity \
--disable-fermion-reps \
--disable-unified


@@ -0,0 +1,10 @@
CLIME=$HOME/install/
module load Clang
module load CUDA
module load FFTW
module load OpenSSL
module load MPFR
module load NVHPC
module load UCX
module load OpenMPI
ulimit -c 0


@@ -1,12 +0,0 @@
MPICXX=mpicxx CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=clang++ ../../configure \
--enable-simd=GEN \
--enable-Nc=1 \
--enable-debug \
--enable-unified=yes \
--prefix $HOME/QCD/GridInstall \
--with-lime=/Users/peterboyle/QCD/SciDAC/install/ \
--with-openssl=$BREW \
--disable-fermion-reps \
--disable-gparity \
--enable-debug


@@ -179,8 +179,8 @@ int main(int argc, char** argv) {
   Np=LanParams.Np;
   int Nm = Nk + Np;
-  int MaxIt = 10000;
-  RealD resid = 1.0e-5;
+  int MaxIt = 100;
+  RealD resid = 1.0e-4;
   //while ( mass > - 5.0){
@@ -314,7 +314,7 @@ int main(int argc, char** argv) {
       }
     }
   }
   for(int i = 0; i < Nconv; i++){
     G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
   }
   cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -322,6 +322,7 @@ int main(int argc, char** argv) {
   cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
   cout << eMMe << endl;
   // vector<LatticeFermion> finalevec(Nconv, FGrid);
   // temporary, until doing rotation
@@ -343,6 +344,20 @@ int main(int argc, char** argv) {
     }
   }
+  for(int i = 0; i < Nconv; i++){
+    Ddwf.M(finalevec[i], G5R5Mevec[i]);
+    for(int j = 0; j < Nconv; j++){
+      std::cout << "<"<<j<<"|Ddwf|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
+    }
+  }
+  for(int i = 0; i < Nconv; i++){
+    RealD t1,t2;
+    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], t1, t2);
+    for(int j = 0; j < Nconv; j++){
+      std::cout << "<"<<j<<"|G5R5 M|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
+    }
+  }
   for(int i = 0; i < Nconv; i++){
     chiral_matrix_real[i].resize(Nconv);
     chiral_matrix[i].resize(Nconv);
@@ -380,7 +395,7 @@ int main(int argc, char** argv) {
   PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
   PYTHON_LINE("");
   PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
-  PYTHON_LINE("hist, xedges, yedges = np.histogram2d(x, y, bins=10, range=[[0, 9], [0, 9]])");
+  fprintf(fp,"hist, xedges, yedges = np.histogram2d(x, y, bins=%d, range=[[0, %d], [0, %d]])\n",Nconv,Nconv-1,Nconv-1);
   PYTHON_LINE("");
   PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
   PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");


@@ -125,4 +125,29 @@ Extensions
8) Example python code: FieldDensity.py . This is not interfaced to Grid.
================
Windowless generation of AVI files: must enable offscreen rendering. From Shuhei Yamamoto:
================
Hi Peter,
To make visualization work on Frontier, I did the following.
For headless off-screen rendering, the following ccmake tabs (in advanced mode) are set as indicated; a non-interactive sketch of the same settings follows the list.
VTK_OPENGL_HAS_* off
VTK_USE_X off
VTK_DEFAULT_RENDER_WINDOW_OFFSCREEN on
VTK_DEFAULT_RENDER_WINDOW_HEADLESS on
The list can be greater than necessary.
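(Not part of the original mail.) A minimal non-interactive sketch of the same settings, assuming the VTK sources live in $VTK_SRC; option names vary between VTK versions, so verify them in ccmake first:
# hypothetical equivalent of the ccmake settings listed above
cmake -S $VTK_SRC -B vtk-build \
  -DVTK_USE_X=OFF \
  -DVTK_OPENGL_HAS_EGL=OFF \
  -DVTK_OPENGL_HAS_OSMESA=OFF \
  -DVTK_DEFAULT_RENDER_WINDOW_OFFSCREEN=ON \
  -DVTK_DEFAULT_RENDER_WINDOW_HEADLESS=ON
cmake --build vtk-build -j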
VTK can fall back to EGL or OSMesa at runtime, so I installed mesa via spack (as well as nasm and yasm). The mesa (or meson) package requires llvm-config, which is included in rocm releases after 6.1; on Frontier I used /opt/rocm-6.2.4. The only problem is that llvm-config lives in /opt/rocm-6.2.4/llvm/bin rather than /opt/rocm-6.2.4/bin, so I edited spack's packages.yaml so that the prefix for the rocm compiler is /opt/rocm-6.2.4/llvm. Just in case, I also changed c and cxx to /opt/rocm-6.2.4/llvm/bin/amdclang and amdclang++ respectively, but this change might not be necessary.
After installation, I added the path to libOSMesa.so to LD_LIBRARY_PATH; there might be a better way, such as specifying an -rpath for the OSMesa library by editing the cmake files.
In addition, I edited vtk's CMakeLists.txt to force it to find the OSMesa package via find_package(OSMesa REQUIRED) after list(INSERT CMAKE_MODULE_PATH 0 "${vtk_cmake_dir}"), since a Find module for it ships in vtk/CMake. There may be a more elegant method, but I was not able to find a tab to switch on OSMesa.
When I compiled vtk with the ffmpeg option and linked it to the Grid visualization code, it produced the AVI file.
Best,
Shuhei
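(Also not from the original mail.) A rough sketch of the runtime setup described above, assuming mesa/OSMesa was installed via spack; the prefix and lib directory are placeholders:
# make the spack-built OSMesa visible to the loader before running the visualization tool
OSMESA_PREFIX=$(spack location -i mesa)        # or the explicit install prefix
export LD_LIBRARY_PATH=$OSMESA_PREFIX/lib:$LD_LIBRARY_PATH   # may be lib64, depending on the build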