mirror of https://github.com/paboyle/Grid.git synced 2025-11-05 06:19:31 +00:00

Compare commits

...

14 Commits

Author SHA1 Message Date
Peter Boyle 7aa06329d0 Update for new stencil compression options 2025-06-17 18:06:19 +02:00
Peter Boyle 9d6a38c44c Compressed comms options as Sloppy 2025-06-17 16:43:53 +02:00
Peter Boyle 6ec5cee368 Preparing for compressed comms 2025-06-17 16:38:10 +02:00
Peter Boyle f2e9a68825 Simplify 2025-06-13 17:32:05 +02:00
Peter Boyle d88750e6b6 Sloppy + non-sloppy 2025-06-13 16:42:01 +02:00
Peter Boyle 821358eda7 Remove partial dirichlet. Favour intro reduced prec comms options 2025-06-13 05:08:45 +02:00
Peter Boyle fce6e1f135 Kill core files for quota reasons 2025-06-13 05:08:15 +02:00
Peter Boyle 8f0bb3e676 remove partial dirichlet 2025-06-13 05:07:56 +02:00
Peter Boyle 262c70d967 USe sloppy comms options 2025-06-13 05:07:23 +02:00
Peter Boyle da43ef7c2d REmove partial dirichlet option. It's going nowhere 2025-06-13 05:05:15 +02:00
Peter Boyle 7b60ab5df1 Warning suppress 2025-06-13 05:04:55 +02:00
Peter Boyle f6b961a64e Warning suppress 2025-06-13 05:04:47 +02:00
Peter Boyle f1ed988aa3 Interface to reduced precision comms 2025-06-13 05:04:12 +02:00
Peter Boyle eea51bb604 Suppress annoying warns 2025-06-13 05:03:36 +02:00
22 changed files with 561 additions and 1068 deletions

View File

@@ -51,11 +51,13 @@ directory
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon

View File

@@ -183,6 +183,7 @@ public:
int recv_from_rank,
int bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
@@ -201,9 +202,9 @@ public:
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int xmit_to_rank,int do_xmit,
void *recv,
void *recv,void *recv_comp,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);

View File

@@ -395,11 +395,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -414,9 +419,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -440,7 +445,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
@@ -449,6 +455,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
@@ -457,7 +464,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
@@ -676,9 +683,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{

View File

@@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0;
}
int CartesianCommunicator::IsOffNode(int rank) { return false; }
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
void *recv,

View File

@@ -543,37 +543,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
// acceleratorPin(HostCommBuf,bytes);
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
@@ -1039,11 +1010,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote;
}
}

View File

@@ -154,6 +154,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
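The same three-line SloppyComms forwarder is added to each fermion implementation header in this set of changes; at user level it is driven as in the updated Benchmark_dwf at the end of this compare. A minimal sketch (illustrative, following that benchmark; Dw, src and result are assumed to be already set up):

Dw.SloppyComms(1);              // off-node halo exchange may use reduced-precision buffers
Dw.Dhop(src,result,DaggerNo);
Dw.SloppyComms(0);              // back to full-precision comms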

View File

@@ -179,6 +179,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@@ -146,6 +146,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in simd directions
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too??? Deprecate
/////////////////////////////////////////////////////////////////////////////////////////////
@@ -242,8 +39,7 @@ public:
//Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
// : public FaceGatherSimple
class WilsonCompressorTemplate : public FaceGatherSimple
{
public:

View File

@@ -165,6 +165,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@@ -204,7 +204,14 @@ public:
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@@ -30,25 +30,26 @@
NAMESPACE_BEGIN(Grid);
uint64_t DslashFullCount;
uint64_t DslashPartialCount;
//uint64_t DslashPartialCount;
uint64_t DslashDirichletCount;
void DslashResetCounts(void)
{
DslashFullCount=0;
DslashPartialCount=0;
// DslashPartialCount=0;
DslashDirichletCount=0;
}
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
{
dirichlet = DslashDirichletCount;
partial = DslashPartialCount;
partial = 0;
full = DslashFullCount;
}
void DslashLogFull(void) { DslashFullCount++;}
void DslashLogPartial(void) { DslashPartialCount++;}
//void DslashLogPartial(void) { DslashPartialCount++;}
void DslashLogDirichlet(void){ DslashDirichletCount++;}
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table)
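With DslashPartialCount retired, the counter interface above keeps its three-argument shape but the partial slot is simply reported as zero. A minimal sketch of reading the counters (illustrative; Dw, src and result as in the benchmarks later in this compare):

uint64_t dirichlet, partial, full;
DslashResetCounts();
Dw.Dhop(src,result,DaggerNo);              // each halo exchange logs either Full or Dirichlet
DslashGetCounts(dirichlet,partial,full);   // partial is now always returned as 0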

View File

@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
// int partialDirichlet;
DefaultImplParams() {
dirichlet.resize(0);
partialDirichlet=0;
// partialDirichlet=0;
};
};
@@ -69,6 +69,12 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table);
class StencilBuffer
{
public:
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
};
void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
@@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
///////////////////////////////////////////////////
// If true, this is partially communicated per face
///////////////////////////////////////////////////
StencilVector _comms_partial_send;
StencilVector _comms_partial_recv;
// StencilVector _comms_partial_send;
// StencilVector _comms_partial_recv;
//
StencilVector _comm_buf_size;
StencilVector _permute_type;
@@ -205,16 +211,16 @@ public:
struct Packet {
void * send_buf;
void * recv_buf;
#ifndef ACCELERATOR_AWARE_MPI
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
#endif
void * compressed_send_buf;
void * compressed_recv_buf;
Integer to_rank;
Integer from_rank;
Integer do_send;
Integer do_recv;
Integer xbytes;
Integer rbytes;
Integer xbytes_compressed;
Integer rbytes_compressed;
};
struct Merge {
static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +229,7 @@ public:
std::vector<cobj *> vpointers;
Integer buffer_size;
Integer type;
Integer partial; // partial dirichlet BCs
// Integer partial; // partial dirichlet BCs
Coordinate dims;
};
struct Decompress {
@@ -231,7 +237,7 @@ public:
cobj * kernel_p;
cobj * mpi_p;
Integer buffer_size;
Integer partial; // partial dirichlet BCs
// Integer partial; // partial dirichlet BCs
Coordinate dims;
};
struct CopyReceiveBuffer {
@@ -252,9 +258,45 @@ public:
protected:
GridBase * _grid;
///////////////////////////////////////////////////
// Sloppy comms will make a second buffer upon comms
///////////////////////////////////////////////////
size_t device_heap_top; //
size_t device_heap_bytes;//
size_t device_heap_size; //
void *DeviceBufferMalloc(size_t bytes)
{
void *ptr = (void *)device_heap_top;
device_heap_top += bytes;
device_heap_bytes+= bytes;
if ( device_heap_bytes > device_heap_size ) {
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
assert (device_heap_bytes <= device_heap_size);
}
return ptr;
}
void DeviceBufferFreeAll(void)
{
device_heap_size = _unified_buffer_size*sizeof(cobj);
// Resize up if necessary, never down
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
}
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
device_heap_size = StencilBuffer::DeviceCommBuf.size();
device_heap_bytes=0;
}
public:
GridBase *Grid(void) const { return _grid; }
/////////////////////////////////////////////////////////
// Control reduced precision comms
/////////////////////////////////////////////////////////
int SloppyComms;
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
////////////////////////////////////////////////////////////////////////
// Needed to conveniently communicate gparity parameters into GPU memory
// without adding parameters. Perhaps a template parameter to StenciView is
@@ -268,7 +310,7 @@ public:
}
int face_table_computed;
int partialDirichlet;
// int partialDirichlet;
int fullDirichlet;
std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list;
@@ -361,24 +403,145 @@ public:
////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel.
////////////////////////////////////////////////////////////////////////
void DecompressPacket(Packet &packet)
{
if ( !SloppyComms ) return;
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.rbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if(sizeof(word)==8) {
// Can either choose to represent as float vs double and prec change
// OR
// truncate the mantissa bfp16 style
double *dbuf =(double *) packet.recv_buf;
float *fbuf =(float *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
});
} else if ( sizeof(word)==4){
// Can either choose to represent as half vs float and prec change
// OR
// truncate the mantissa bfp16 style
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
});
} else {
assert(0 && "unknown floating point precision");
}
}
}
void CompressPacket(Packet &packet)
{
packet.xbytes_compressed = packet.xbytes;
packet.compressed_send_buf = packet.send_buf;
packet.rbytes_compressed = packet.rbytes;
packet.compressed_recv_buf = packet.recv_buf;
if ( !SloppyComms ) {
return;
}
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.xbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
packet.rbytes_compressed = packet.rbytes/2;
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
}
//else {
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
// }
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
packet.xbytes_compressed = packet.xbytes/2;
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
if(sizeof(word)==8) {
double *dbuf =(double *) packet.send_buf;
float *fbuf =(float *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
});
} else if ( sizeof(word)==4){
uint32_t *fbuf =(uint32_t *) packet.send_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
});
} else {
assert(0 && "unknown floating point precision");
}
}
// else {
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
// }
return;
}
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
///////////////////////////////////////////////
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
// accelerator_barrier(); All kernels should ALREADY be complete
//Everyone is here, so noone running slow and still using receive buffer
_grid->StencilBarrier();
// But the HaloGather had a barrier too.
///////////////////////////////////////////////
if (SloppyComms) {
DeviceBufferFreeAll();
}
for(int i=0;i<Packets.size();i++){
this->CompressPacket(Packets[i]);
}
if (SloppyComms) {
accelerator_barrier();
#ifdef NVLINK_GET
_grid->StencilBarrier();
#endif
}
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].compressed_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].compressed_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
@@ -389,19 +552,22 @@ public:
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].send_buf,Packets[i].compressed_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].recv_buf,Packets[i].compressed_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
// std::cout << "Communicate Begin started "<<i<<std::endl;
// _grid->Barrier();
}
FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_send )
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
}
}
@@ -416,14 +582,15 @@ public:
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
// if ( this->partialDirichlet ) DslashLogPartial();
if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier();
for(int i=0;i<Packets.size();i++){
this->DecompressPacket(Packets[i]);
if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
}
FlightRecorder::StepLog("Finish communicate complete");
}
@@ -618,7 +785,7 @@ public:
}
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
Decompress d;
d.partial = this->partialDirichlet;
// d.partial = this->partialDirichlet;
d.dims = _grid->_fdimensions;
d.kernel_p = k_p;
d.mpi_p = m_p;
@@ -627,7 +794,7 @@ public:
}
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m;
m.partial = this->partialDirichlet;
// m.partial = this->partialDirichlet;
m.dims = _grid->_fdimensions;
m.type = type;
m.mpointer = merge_p;
@@ -732,8 +899,8 @@ public:
int block = dirichlet_block[dimension];
this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim;
this->_comms_partial_send[ii] = 0;
this->_comms_partial_recv[ii] = 0;
// this->_comms_partial_send[ii] = 0;
// this->_comms_partial_recv[ii] = 0;
if ( block && comm_dim ) {
assert(abs(displacement) < ld );
// Quiesce communication across block boundaries
@@ -754,10 +921,10 @@ public:
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
}
if ( partialDirichlet ) {
this->_comms_partial_send[ii] = !this->_comms_send[ii];
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
}
// if ( partialDirichlet ) {
// this->_comms_partial_send[ii] = !this->_comms_send[ii];
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
// }
}
}
}
@@ -769,6 +936,7 @@ public:
Parameters p=Parameters(),
bool preserve_shm=false)
{
SloppyComms = 0;
face_table_computed=0;
_grid = grid;
this->parameters=p;
@@ -786,7 +954,7 @@ public:
this->same_node.resize(npoints);
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
partialDirichlet = p.partialDirichlet;
// partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up
fullDirichlet=0;
for(int d=0;d<p.dirichlet.size();d++){
@@ -867,7 +1035,7 @@ public:
/////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd();
// Allow for multiple stencils to exist simultaneously
// Allow for multiple stencils to be communicated simultaneously
if (!preserve_shm)
_grid->ShmBufferFreeAll();
@@ -935,7 +1103,8 @@ public:
GridBase *grid=_grid;
const int Nsimd = grid->Nsimd();
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
int comms_recv = this->_comms_recv[point];
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@@ -1124,8 +1293,8 @@ public:
int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ;
// int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ;
assert(rhs.Grid()==_grid);
// conformable(_grid,rhs.Grid());
@@ -1160,11 +1329,11 @@ public:
int rbytes;
if ( comms_send ) xbytes = bytes; // Full send
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; // full dirichlet
if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0;
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1191,7 +1360,8 @@ public:
}
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
if ( compress.DecompressionStep()&&comms_recv) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
@@ -1225,7 +1395,8 @@ public:
#endif
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
if ( !duplicate ) { // Force comms for now
@@ -1234,8 +1405,8 @@ public:
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
int do_send = (comms_send) && (!shm_send );
int do_recv = (comms_send) && (!shm_recv );
AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off],
xmit_to_rank, do_send,
@@ -1243,7 +1414,7 @@ public:
xbytes,rbytes);
}
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
if ( (compress.DecompressionStep() && comms_recv) ) {
AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off],
words,Decompressions);
@@ -1265,8 +1436,8 @@ public:
int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ;
// int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@@ -1341,18 +1512,20 @@ public:
if ( comms_send ) xbytes = bytes;
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0;
if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0;
// Gathers SIMD lanes for send and merge
// Different faces can be full comms or partial comms with multiple ranks per node
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
if ( comms_send || comms_recv ) {
int partial = partialDirichlet;
// int partial = partialDirichlet;
int partial = 0;
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
spointers,dimension,sx,cbmask,
compress,permute_type,partial );
@@ -1418,7 +1591,8 @@ public:
if ( (bytes != rbytes) && (rbytes!=0) ){
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
}
int do_send = (comms_send|comms_partial_send) && (!shm_send );
// int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_send = (comms_send) && (!shm_send );
AddPacket((void *)sp,(void *)rp,
xmit_to_rank,do_send,
recv_from_rank,do_send,
@@ -1432,7 +1606,8 @@ public:
}
}
// rpointer may be doing a remote read in the gather over SHM
if ( comms_recv|comms_partial_recv ) {
// if ( comms_recv|comms_partial_recv ) {
if ( comms_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
}
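The CompressPacket/DecompressPacket pair above halves off-node traffic when SloppyComms is set: fp64 words are exchanged as fp32, and fp32 words are truncated bfloat16-style (the s7e8 format named in the send-side comment), keeping the top 16 bits of each word and zero-padding them again on receipt. A standalone scalar sketch of that fp32 round trip (host-side illustration only, not taken from the patch):

#include <cstdint>
#include <cstring>

static inline uint16_t bf16_truncate(float x) {
  uint32_t bits; std::memcpy(&bits, &x, sizeof(bits));
  return (uint16_t)(bits >> 16);             // keep sign, 8-bit exponent, top 7 mantissa bits
}
static inline float bf16_expand(uint16_t h) {
  uint32_t bits = ((uint32_t)h) << 16;       // pad the discarded mantissa bits with zeroes
  float x; std::memcpy(&x, &bits, sizeof(x));
  return x;                                  // truncation error below 2^-7 relative
}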

View File

@@ -67,7 +67,7 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP_FMT(totalGlobalMem,"%zu");
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);

View File

@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
@@ -237,7 +237,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};

View File

@@ -201,8 +201,7 @@ int main(int argc, char **argv) {
Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=0;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// ParamsDir.partialDirichlet=0;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@@ -298,11 +297,11 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));

View File

@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
ParamsF.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDirF.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1;
ParamsDirF.partialDirichlet=1;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// ParamsDir.partialDirichlet=1;
// ParamsDirF.partialDirichlet=1;
// std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet;
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
// ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet;
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
// ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));

View File

@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
@@ -193,101 +193,6 @@ int main (int argc, char ** argv)
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
dbytes=0;
ncomm=0;
std::vector<CommsRequest_t> requests;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
}
}
Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
// double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
@@ -296,45 +201,34 @@ int main (int argc, char ** argv)
std::vector<CommsRequest_t> requests;
dbytes=0;
ncomm=0;
for(int mu=0;mu<4;mu++){
for(int dir=0;dir<8;dir++) {
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
int tid = omp_get_thread_num();
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
dbytes+=tbytes;
}
}
}
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);

View File

@@ -32,18 +32,18 @@
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
int main (int argc, char ** argv)
{
@@ -52,39 +52,108 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=8;
for(int i=0;i<argc;i++)
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#undef SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermion src (FGrid); random(RNG5,src);
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
@@ -100,46 +169,39 @@ int main (int argc, char ** argv)
src = src*N2;
#endif
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeField Umu(UGrid);
GaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrix ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
std::vector<ColourMatrixField> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
@@ -147,10 +209,28 @@ int main (int argc, char ** argv)
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
@@ -167,11 +247,9 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -181,9 +259,15 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
@@ -198,8 +282,8 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@@ -208,28 +292,27 @@ int main (int argc, char ** argv)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
std::cout<<GridLogMessage << "RESULT" << std::endl;
// std::cout << result<<std::endl;
std::cout << norm2(result)<<std::endl;
std::cout<<GridLogMessage << "REF" << std::endl;
std::cout << norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "ERR" << std::endl;
std::cout << norm2(err)<<std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
assert (n2e< 1.0e-4 );
}
if (1)
@@ -238,16 +321,30 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
tmp = Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
}
}
tmp =adj(U[mu])*src;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
@@ -259,27 +356,27 @@ int main (int argc, char ** argv)
}
ref = -0.5*ref;
}
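
The rewritten reference term above applies the 4d gauge link per outer site while looping the fifth dimension explicitly: U[mu] lives on the 4d grid, whereas the 5d fermion stores Ls s-slices contiguously for each 4d site, so the flattened index is s + Ls*ss. A minimal sketch of that indexing convention (oSites4d, U4 and psi5 are placeholder names, not Grid API):

    for(int ss=0; ss<oSites4d; ss++){                   // 4d outer sites
      for(int s=0; s<Ls; s++){                          // fifth dimension, innermost
        psi5[s + Ls*ss] = U4[ss] * psi5[s + Ls*ss];     // one link applied to every s-slice
      }
    }
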
// dump=1;
Dw.Dhop(src,result,1);
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
@@ -291,10 +388,8 @@ int main (int argc, char ** argv)
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -308,13 +403,7 @@ int main (int argc, char ** argv)
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
@@ -338,14 +427,9 @@ int main (int argc, char ** argv)
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
@@ -354,6 +438,4 @@ int main (int argc, char ** argv)
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}
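
The even-odd checks in this file now end in hard asserts rather than commented-out field dumps. The structure of the check, condensed from the calls appearing in this compare (it reuses the fields declared in the benchmark, so it is an excerpt rather than a standalone program):

    Dw.DhopEO(src_o, r_e, DaggerNo);   // odd source  -> even result
    Dw.DhopOE(src_e, r_o, DaggerNo);   // even source -> odd result
    Dw.Dhop  (src, result, DaggerNo);  // unpreconditioned operator
    setCheckerboard(r_eo, r_o);        // recombine the two parities
    setCheckerboard(r_eo, r_e);
    err = r_eo - result;               // Deo + Doe should reproduce Dhop
    assert(norm2(err) < 1.0e-4);
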

View File

@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet);
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
int main (int argc, char ** argv)
{
@@ -69,11 +69,19 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
@@ -81,42 +89,35 @@ int main (int argc, char ** argv)
GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////
// Node level
//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet)
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
@@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;
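
The only switch these benchmarks flip between the full and sloppy passes is the new SloppyComms(bool) setter on the fermion action. A minimal sketch of driving it directly, with Dw, src and result as declared in the benchmark; note the benchmark constructs a fresh action per pass, so toggling an existing action in place is an assumption about the interface rather than something exercised here:

    Dw.SloppyComms(false);             // full communication
    Dw.Dhop(src, result, DaggerNo);
    Dw.SloppyComms(true);              // sloppy communication
    Dw.Dhop(src, result, DaggerNo);
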

View File

@@ -1,465 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
for(auto partial : {0}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
//Coordinate shm({2,1,1,1});
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
//////////////////////
// Node level
//////////////////////
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid);
GaugeField UmuFull(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
UmuFull=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
if(!partial) Filter.applyFilter(UmuCopy);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid);
std::vector<ColourMatrixField> Ucopy(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
p.partialDirichlet=partial;
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
int ncall =1;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
DumpSliceNorm("s-slice ref ",ref,1);
DumpSliceNorm("s-slice res ",result,1);
DumpSliceNorm("s-slice error ",err,1);
exit(-1);
}
assert (n2e< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
}

View File

@@ -7,3 +7,4 @@ module load MPFR
module load NVHPC
module load UCX
module load OpenMPI
ulimit -c 0