Merge 461cd045c6 into da59379612

Large reg file for double
ifdef guard omitted
2025-10-25 18:19:34 +01:00 · 2024-03-27 02:11:32 +09:00 · 2024-03-26 17:03:20 +00:00 · 2024-03-26 14:50:32 +00:00 · 2024-03-26 14:41:25 +00:00 · 2024-03-26 14:14:06 +00:00
24 changed files with 851 additions and 406 deletions
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
  return offbytes;
 }

+#undef NVLINK_GET // Define to use get instead of put DMA
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
+#ifdef NVLINK_GET
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+#endif
  }
  
  if (dox) {
+    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(xrq);
      off_node_bytes+=xbytes;
    } else {
+#ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
+      
    }
  }

@@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
 {
  int nreq=list.size();

+  acceleratorCopySynchronise();
+
  if (nreq==0) return;

  std::vector<MPI_Status> status(nreq);
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@@ -40,6 +40,9 @@ int                 GlobalSharedMemory::_ShmAlloc;
 uint64_t            GlobalSharedMemory::_ShmAllocBytes;

 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
+#ifndef ACCELERATOR_AWARE_MPI
+void * GlobalSharedMemory::HostCommBuf;
+#endif

 Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
 int                 GlobalSharedMemory::WorldShmRank;
@@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
+#ifndef ACCELERATOR_AWARE_MPI
+void *SharedMemory::HostBufferMalloc(size_t bytes){ // Carve `bytes` off the host comms heap and return the start of the new region.
+  void *ptr = (void *)host_heap_top; // the region starts at the current top; no alignment padding is added
+  host_heap_top  += bytes; // bump the top past this allocation
+  host_heap_bytes+= bytes; // running total handed out since the last HostBufferFreeAll()
+  if (host_heap_bytes >= host_heap_size) { // note: filling the heap exactly (==) is also reported as exhaustion
+    std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
+    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
+    assert(host_heap_bytes<host_heap_size); // condition is false on this path, so this aborts when asserts are enabled
+  }
+  return ptr; // after an overflow this is reached only if assert is compiled out
+}
+void SharedMemory::HostBufferFreeAll(void) { // Release every HostBufferMalloc() region at once by rewinding the heap.
+  host_heap_top  =(size_t)HostCommBuf; // the next allocation starts again at the base of the host buffer
+  host_heap_bytes=0; // nothing is outstanding after the rewind
+}
+#endif
 void *SharedMemory::ShmBufferMalloc(size_t bytes){
  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
  void *ptr = (void *)heap_top;
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -75,7 +75,9 @@ public:
  static int           Hugepages;

  static std::vector<void *> WorldShmCommBufs;
-
+#ifndef ACCELERATOR_AWARE_MPI
+  static void *HostCommBuf;
+#endif
  static Grid_MPI_Comm WorldComm;
  static int           WorldRank;
  static int           WorldSize;
@@ -120,6 +122,13 @@ private:
  size_t heap_bytes;
  size_t heap_size;

+#ifndef ACCELERATOR_AWARE_MPI
+  size_t host_heap_top;  // set in free all
+  size_t host_heap_bytes;// set in free all
+  void *HostCommBuf;     // set in SetCommunicator
+  size_t host_heap_size; // set in SetCommunicator
+#endif
+  
 protected:

  Grid_MPI_Comm    ShmComm; // for barriers
@@ -151,7 +160,10 @@ public:
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void  ShmBufferFreeAll(void) ;
-  
+#ifndef ACCELERATOR_AWARE_MPI
+  void *HostBufferMalloc(size_t bytes);
+  void HostBufferFreeAll(void);
+#endif  
  //////////////////////////////////////////////////////////////////////////
  // Make info on Nodes & ranks and Shared memory available
  //////////////////////////////////////////////////////////////////////////
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -39,10 +39,12 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <hip/hip_runtime_api.h>
 #endif
 #ifdef GRID_SYCL
+#ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
-#include <syscall.h>
 #define SHM_SOCKETS
 #endif 
+#include <syscall.h>
+#endif

 #include <sys/socket.h>
 #include <sys/un.h>
@@ -511,46 +513,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL)
-
-//if defined(GRID_SYCL)
-#if 0
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  void * ShmCommBuf ; 
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the pointer array for shared windows for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Each MPI rank should allocate our own buffer
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
-
-  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
-    exit(EXIT_FAILURE);  
-  }
-
-  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
-	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
-
-  SharedMemoryZero(ShmCommBuf,bytes);
-
-  assert(WorldShmSize == 1);
-  for(int r=0;r<WorldShmSize;r++){
-    WorldShmCommBufs[r] = ShmCommBuf;
-  }
-  _ShmAllocBytes=bytes;
-  _ShmAlloc=1;
-}
-#endif
-
 #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
@@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifndef ACCELERATOR_AWARE_MPI
+  HostCommBuf= malloc(bytes);
+#endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 }
-#endif

 #else 
 #ifdef GRID_MPI3_SHMMMAP
@@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
  ShmBufferFreeAll();

+#ifndef ACCELERATOR_AWARE_MPI
+  host_heap_size = heap_size;
+  HostCommBuf= GlobalSharedMemory::HostCommBuf;
+  HostBufferFreeAll();
+#endif  
+
  /////////////////////////////////////////////////////////////////////
  // find comm ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -285,13 +285,25 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();
-  uint32_t csum=0;
-  //  Uint32Checksum(left,csum);
+  // The checksum logging below is compiled only for SYCL builds.
+#ifdef GRID_SYCL
+  uint64_t csum=0; // stays 0 when the flight recorder logging mode is None
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
+    // Hack: reinterpret the storage of `left` as raw 64-bit words for checksumming
+    // Fast integer xor checksum. Can also be used in comms now.
+    autoView(l_v,left,AcceleratorRead);
+    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); // count of 64-bit words covering oSites() objects of type vobj
+    uint64_t *base= (uint64_t *)&l_v[0];
+    csum=svm_xor(base,words);
+  }
+  FlightRecorder::CsumLog(csum); // called regardless of logging mode (csum is 0 if the branch above was skipped)
+#endif
  ComplexD nrm = rankInnerProduct(left,right);
  RealD local = real(nrm);
-  GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI
+  FlightRecorder::NormLog(real(nrm)); // value before the global sum
  grid->GlobalSum(nrm);
-  GridMPINormLog(local,real(nrm)); 
+  FlightRecorder::ReductionLog(local,real(nrm)); // pre-sum value alongside the globally summed one
  return nrm;
 }

--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
  return result;
 }

-NAMESPACE_END(Grid);

-/*
-template<class Double> Double svm_reduce(Double *vec,uint64_t L)
+template<class Word> Word svm_xor(Word *vec,uint64_t L) // XOR-reduce the L words at vec on the accelerator queue and return the result.
 {
-  Double sumResult; zeroit(sumResult);
-  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
-  Double identity;  zeroit(identity);
+  Word identity;  identity=0; // 0 is the identity element of XOR
+  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
+  d_sum[0] = identity; // seed the accumulator: the reduction folds into the value already stored here, which malloc_shared leaves uninitialised
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
      cgh.parallel_for(cl::sycl::range<1>{L},
 		      Reduction,
 		      [=] (cl::sycl::id<1> index, auto &sum) {
-	 sum +=vec[index];
+	 sum ^=vec[index];
      });
    });
   theGridAccelerator->wait();
-  Double ret = d_sum[0];
+  Word ret = d_sum[0]; // read the result on the host before releasing the allocation
   free(d_sum,*theGridAccelerator);
-  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
   return ret;
 }

+NAMESPACE_END(Grid);
+
+/*
+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@@ -1,5 +1,5 @@
 #pragma once
-#include <type_traits>
+
 #if defined(GRID_CUDA)

 #include <cub/cub.cuh>
@@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
  

 }
+#endif 

-template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+
+#if defined(GRID_SYCL)
+template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) // For each of the rd slices, sum the e1*e2 elements of Data selected via stride/ostride and store the result in lvSum[r].
+{
+  size_t subvol_size = e1*e2; // number of elements summed per slice
+
+  vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); // one accumulator per slice; written by the host loop below and targeted by the reductions
+  vobj vobj_zero;
+  zeroit(vobj_zero);
+  for (int r = 0; r<rd; r++) { // start every accumulator at zero: no identity is passed to cl::sycl::reduction below
+    mysum[r] = vobj_zero; 
+  }
+
+  commVector<vobj> reduction_buffer(rd*subvol_size); // staging buffer: slice r occupies [r*subvol_size, (r+1)*subvol_size)
+
+  auto rb_p = &reduction_buffer[0];
+
+  // autoView(Data_v, Data, AcceleratorRead);
+
+  //prepare reduction buffer 
+  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
+  
+      int n = s / e2; // split the in-slice index s into (n, b) = (s / e2, s % e2)
+      int b = s % e2;
+      int so=r*ostride; // base offset for start of plane 
+      int ss= so+n*stride+b; // source index into Data for element s of slice r
+
+      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
+
+  });
+
+  for (int r = 0; r < rd; r++) { // submit one reduction per slice; nothing is waited on inside this loop
+      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
+          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+          Reduction,
+          [=](cl::sycl::id<1> item, auto &sum) {
+              auto s = item[0];
+              sum += rb_p[r*subvol_size+s];
+          });
+      });
+      
+     
+  }
+  theGridAccelerator->wait(); // single wait after all rd reductions have been submitted
+  for (int r = 0; r < rd; r++) { // copy the per-slice sums out to the caller's vector
+    lvSum[r] = mysum[r];
+  }
+  free(mysum,*theGridAccelerator);
+}
+#endif
+
+template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
  typedef typename vobj::vector_type vector;
  const int words = sizeof(vobj)/sizeof(vector);
  const int osites = rd*e1*e2;
@@ -106,7 +159,11 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 	    buf[ss] = dat[ss*words+w];
    });

+    #if defined(GRID_CUDA) || defined(GRID_HIP)
      sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    #elif defined(GRID_SYCL)
+      sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    #endif

    for (int r = 0; r < rd; r++) {
      lvSum_ptr[w+words*r]=lvSum_small[r];
@@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
  
 }

-template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) // Accelerator entry point: opens a read view of Data and dispatches on sizeof(vobj).
 {
-  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
+  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
    if constexpr (sizeof(vobj) <= 256) { 
+      // objects of at most 256 bytes go straight to the backend-specific small-object reduction
+      #if defined(GRID_CUDA) || defined(GRID_HIP)
        sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #elif defined (GRID_SYCL)
+        sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #endif
+      // NOTE(review): if none of the three backends is defined this branch is empty and lvSum is left untouched
    }
    else {
-      sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); // larger objects take the backend-neutral large path
    }
 }
-#endif


-#if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
-{
-  typedef typename vobj::scalar_object sobj;
-  size_t subvol_size = e1*e2;
-
-  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
-  vobj vobj_zero;
-  zeroit(vobj_zero);
-    
-  commVector<vobj> reduction_buffer(rd*subvol_size);    
-
-  auto rb_p = &reduction_buffer[0];
-
-  autoView(Data_v, Data, AcceleratorRead);
-
-  //prepare reduction buffer 
-  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
-  
-      int n = s / e2;
-      int b = s % e2;
-      int so=r*ostride; // base offset for start of plane 
-      int ss= so+n*stride+b;
-
-      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
-
-  });
-
-  for (int r = 0; r < rd; r++) {
-      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
-          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
-              auto s = item[0];
-              sum += rb_p[r*subvol_size+s];
-          });
-      });
-      theGridAccelerator->wait();
-      lvSum[r] = mysum[0];
-  }
-  
-  free(mysum,*theGridAccelerator);
-}
-#endif
-
 template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
  // sum over reduced dimension planes, breaking out orthog dir
@@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data

 template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
 {
-  #if defined(GRID_CUDA) || defined(GRID_HIP)
+  #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
  
-  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-  
-  #elif defined(GRID_SYCL)
-  
-  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  
  #else
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v , st,AcceleratorRead);

   if( interior && exterior ) {
+     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v ,st,AcceleratorRead);

   if( interior && exterior ) {
+     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
 #ifndef GRID_CUDA
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -70,57 +70,6 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table);

-/*
-template<class vobj,class cobj,class compressor>
-void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline));
-
-template<class vobj,class cobj,class compressor>
-void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
-{
-  int num=table.size();
-  std::pair<int,int> *table_v = & table[0];
-
-  auto rhs_v = rhs.View(AcceleratorRead);
-  accelerator_forNB( i,num, vobj::Nsimd(), {
-    compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
-  });
-  rhs_v.ViewClose();
-}
-
-///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
-///////////////////////////////////////////////////////////////////
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-				 commVector<cobj *> pointers,
-				 int dimension,int plane,
-				 int cbmask,compressor &compress,int type) __attribute__((noinline));
-
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
-				 const Lattice<vobj> &rhs,
-				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
-				 compressor &compress,int type)
-{
-  assert( (table.size()&0x1)==0);
-  int num=table.size()/2;
-  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
-
-  auto rhs_v = rhs.View(AcceleratorRead);
-  auto rhs_p = &rhs_v[0];
-  auto p0=&pointers[0][0];
-  auto p1=&pointers[1][0];
-  auto tp=&table[0];
-  accelerator_forNB(j, num, vobj::Nsimd(), {
-      compress.CompressExchange(p0,p1, rhs_p, j,
-				so+tp[2*j  ].second,
-				so+tp[2*j+1].second,
-				type);
-  });
-  rhs_v.ViewClose();
-}
-*/
-
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@@ -258,6 +207,10 @@ public:
  struct Packet {
    void * send_buf;
    void * recv_buf;
+#ifndef ACCELERATOR_AWARE_MPI
+    void * host_send_buf; // Host staging copy of send_buf; exists only when ACCELERATOR_AWARE_MPI is not defined
+    void * host_recv_buf; // Host staging copy of recv_buf; exists only when ACCELERATOR_AWARE_MPI is not defined
+#endif
    Integer to_rank;
    Integer from_rank;
    Integer do_send;
@@ -324,7 +277,7 @@ public:
  Vector<int> surface_list;

  stencilVector<StencilEntry>  _entries; // Resident in managed memory
-  commVector<StencilEntry>     _entries_device; // Resident in managed memory
+  commVector<StencilEntry>     _entries_device; // Resident in device memory
  std::vector<Packet> Packets;
  std::vector<Merge> Mergers;
  std::vector<Merge> MergersSHM;
@@ -408,33 +361,16 @@ public:
  // Use OpenMP Tasks for cleaner ???
  // must be called *inside* parallel region
  //////////////////////////////////////////
-  /*
-  void CommunicateThreaded()
-  {
-#ifdef GRID_OMP
-    int mythread = omp_get_thread_num();
-    int nthreads = CartesianCommunicator::nCommThreads;
-#else
-    int mythread = 0;
-    int nthreads = 1;
-#endif
-    if (nthreads == -1) nthreads = 1;
-    if (mythread < nthreads) {
-      for (int i = mythread; i < Packets.size(); i += nthreads) {
-	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-						      Packets[i].to_rank,
-						      Packets[i].recv_buf,
-						      Packets[i].from_rank,
-						      Packets[i].bytes,i);
-      }
-    }
-  }
-  */
  ////////////////////////////////////////////////////////////////////////
  // Non blocking send and receive. Necessarily parallel.
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
+    // All GPU kernel tasks must complete
+    //    accelerator_barrier();     // All kernels should ALREADY be complete
+    //    _grid->StencilBarrier();   // Everyone is here, so no one is running slow and still using the receive buffer
+                               // But the HaloGather had a barrier too.
+#ifdef ACCELERATOR_AWARE_MPI
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@@ -443,16 +379,54 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
+#else
+#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
+    for(int i=0;i<Packets.size();i++){
+      // Introduce a host buffer with a cheap slab allocator and zero cost wipe all
+      Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
+      Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
+      if ( Packets[i].do_send ) {
+	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
+      }
+      _grid->StencilSendToRecvFromBegin(MpiReqs,
+					Packets[i].host_send_buf,
+					Packets[i].to_rank,Packets[i].do_send,
+					Packets[i].host_recv_buf,
+					Packets[i].from_rank,Packets[i].do_recv,
+					Packets[i].xbytes,Packets[i].rbytes,i);
+    }
+#endif
+    // Get comms started then run checksums
+    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
+    for(int i=0;i<Packets.size();i++){
+      if ( Packets[i].do_send )
+	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
+    }
  }

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
+    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
-    acceleratorCopySynchronise();
+    // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
+    //    accelerator_barrier(); 
    _grid->StencilBarrier(); 
+#ifndef ACCELERATOR_AWARE_MPI
+#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
+    for(int i=0;i<Packets.size();i++){ // move each received packet from its host staging buffer into recv_buf
+      if ( Packets[i].do_recv ) {
+	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
+      }
+    }
+    _grid->HostBufferFreeAll(); // rewind the host slab allocator used by CommunicateBegin
+#endif
+    // run any checksums
+    for(int i=0;i<Packets.size();i++){ // logs recv_buf, i.e. after any host staging copy above
+      if ( Packets[i].do_recv )
+	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
+    }
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@@ -528,6 +502,7 @@ public:
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
+    //    accelerator_barrier();
    _grid->StencilBarrier();// Synch shared memory on a single nodes

    assert(source.Grid()==_grid);
@@ -540,10 +515,9 @@ public:
      compress.Point(point);
      HaloGatherDir(source,compress,point,face_idx);
    }
-    accelerator_barrier();
+    accelerator_barrier(); // All my local gathers are complete
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
-
  }

  /////////////////////////
@@ -579,6 +553,7 @@ public:
      accelerator_forNB(j, words, cobj::Nsimd(), {
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
      });
+      acceleratorFenceComputeStream();
    }
  }
  
@@ -669,6 +644,7 @@ public:
    for(int i=0;i<dd.size();i++){
      decompressor::DecompressFace(decompress,dd[i]);
    }
+    acceleratorFenceComputeStream(); // dependent kernels
  }
  ////////////////////////////////////////
  // Set up routines
@@ -1224,7 +1200,6 @@ public:
 	  ///////////////////////////////////////////////////////////
 	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
-	  
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, do_send,
--- a/Grid/util/FlightRecorder.cc
+++ b/Grid/util/FlightRecorder.cc
@@ -0,0 +1,339 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/Init.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@MacBook-Pro.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+///////////////////////////////////////////////////////
+// Grid Norm logging for repro testing
+///////////////////////////////////////////////////////
+int FlightRecorder::PrintEntireLog;      // non-zero: Verify mode also prints entries that match
+int FlightRecorder::ContinueOnFail;      // non-zero: a mismatch is counted instead of hitting assert(0)
+int FlightRecorder::LoggingMode;         // current mode; holds a LoggingMode_t value
+int FlightRecorder::ChecksumComms;       // non-zero: recvLog is active
+int FlightRecorder::ChecksumCommsSend;   // non-zero: xmitLog checksums; zero selects its barrier path
+int32_t  FlightRecorder::XmitLoggingCounter;      // next index into XmitLogVector
+int32_t  FlightRecorder::RecvLoggingCounter;      // next index into RecvLogVector
+int32_t  FlightRecorder::CsumLoggingCounter;      // next index into CsumLogVector
+int32_t  FlightRecorder::NormLoggingCounter;      // next index into NormLogVector
+int32_t  FlightRecorder::ReductionLoggingCounter; // next index into ReductionLogVector
+uint64_t FlightRecorder::ErrorCounter;            // mismatches counted by the Verify branches
+std::vector<double> FlightRecorder::NormLogVector;      // values recorded by NormLog
+std::vector<double> FlightRecorder::ReductionLogVector; // global values recorded by ReductionLog
+std::vector<uint64_t> FlightRecorder::CsumLogVector;    // checksums recorded by CsumLog
+std::vector<uint64_t> FlightRecorder::XmitLogVector;    // send-buffer xors recorded by xmitLog
+std::vector<uint64_t> FlightRecorder::RecvLogVector;    // receive-buffer xors recorded by recvLog
+
+void FlightRecorder::ResetCounters(void)
+{ // Rewind all five log cursors to entry zero; the recorded vectors are left untouched.
+  NormLoggingCounter      = 0;
+  CsumLoggingCounter      = 0;
+  ReductionLoggingCounter = 0;
+  XmitLoggingCounter      = 0;
+  RecvLoggingCounter      = 0;
+}
+void FlightRecorder::Truncate(void)
+{ // Empty every recorded log and rewind the matching cursors.
+  NormLogVector.clear();
+  CsumLogVector.clear();
+  ReductionLogVector.clear();
+  XmitLogVector.clear();
+  RecvLogVector.clear();
+  ResetCounters();
+}
+// Select the flight-recorder mode.
+//   Print / Record / Verify : forwarded to the matching SetLoggingModeXxx().
+//   None                    : the mode is stored and all logs and counters are dropped.
+// Any other value trips assert(0).
+void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
+{
+  if ( mode == LoggingModePrint ) {
+    SetLoggingModePrint();
+  } else if ( mode == LoggingModeRecord ) {
+    SetLoggingModeRecord();
+  } else if ( mode == LoggingModeVerify ) {
+    SetLoggingModeVerify();
+  } else if ( mode == LoggingModeNone ) {
+    // Handled inline: there is no dedicated setter for None.
+    LoggingMode = mode;
+    Truncate();
+  } else {
+    assert(0);
+  }
+}
+
+void FlightRecorder::SetLoggingModePrint(void)
+{ // Start from empty logs, switch to print mode, and announce it on stdout.
+  Truncate();
+  LoggingMode = LoggingModePrint;
+  std::cout << " FlightRecorder: set to print output " <<std::endl;
+}
+void FlightRecorder::SetLoggingModeRecord(void)
+{ // Start from empty logs, switch to record mode, and announce it on stdout.
+  Truncate();
+  LoggingMode = LoggingModeRecord;
+  std::cout << " FlightRecorder: set to RECORD " <<std::endl;
+}
+void FlightRecorder::SetLoggingModeVerify(void)
+{ // The recorded logs are kept; only the cursors are rewound so new values are compared from entry zero.
+  std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl;
+  LoggingMode = LoggingModeVerify;
+  ResetCounters();
+}
+uint64_t FlightRecorder::ErrorCount(void)
+{ // Mismatches counted so far by the Verify branches; nothing in this file resets ErrorCounter.
+  return ErrorCounter;
+}
+void FlightRecorder::NormLog(double value)
+{ // Print, record, or verify one norm according to LoggingMode (no action in LoggingModeNone).
+  uint64_t hex = * ( (uint64_t *)&value ); // raw bit pattern of value, for exact hex output
+  if(LoggingMode == LoggingModePrint) {
+    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
+    NormLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeRecord) {
+    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
+    NormLogVector.push_back(value);
+    NormLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeVerify) {
+    const std::ios_base::fmtflags saved = std::cerr.flags(); // std::hexfloat below is sticky
+    if(NormLoggingCounter < NormLogVector.size()){ // calls past the end of the log are not compared
+      uint64_t hexref  = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] );
+      // A NaN is always reported, even if the recorded value was a NaN too.
+      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
+
+	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
+		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
+		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
+
+	std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl;
+
+	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
+		GridHostname(),
+		GlobalSharedMemory::WorldShmRank,
+		NormLoggingCounter,NormLogVector.size(),
+		value, NormLogVector[NormLoggingCounter]); fflush(stderr);
+
+	if(!ContinueOnFail)assert(0); // Force takedown of job
+	  
+	ErrorCounter++;
+      } else {
+	if ( PrintEntireLog ) { 
+	  std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
+		   <<" "<<hex<<" "<<hexref
+		   <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl;
+	}
+      }
+      std::cerr.flags(saved); // restore the caller's number formatting on std::cerr
+    }
+    NormLoggingCounter++; // advance first, so the message fires on the call that consumes the last entry
+    if ( NormLogVector.size()==NormLoggingCounter ) {
+      std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
+    }
+  }
+}
+void FlightRecorder::CsumLog(uint64_t hex)
+{ // Print, record, or verify one checksum according to LoggingMode (no action in LoggingModeNone).
+  if(LoggingMode == LoggingModePrint) {
+    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
+    CsumLoggingCounter++;
+  }
+
+  if(LoggingMode == LoggingModeRecord) {
+    // Report this log's own cursor, not the norm log's.
+    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
+    CsumLogVector.push_back(hex);
+    CsumLoggingCounter++;
+  }
+
+  if(LoggingMode == LoggingModeVerify) {
+    if(CsumLoggingCounter < CsumLogVector.size()) { // calls past the end of the log are not compared
+      uint64_t hexref  = CsumLogVector[CsumLoggingCounter] ;
+
+      if ( hex != hexref ) {
+
+        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
+		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
+
+	// uint64_t is not unsigned long on every platform: cast so the arguments match %llx.
+	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %llx expect %llx\n",
+		GridHostname(),
+		GlobalSharedMemory::WorldShmRank,
+		CsumLoggingCounter,(unsigned long long)hex, (unsigned long long)hexref);
+	fflush(stderr);
+
+	if(!ContinueOnFail) assert(0); // Force takedown of job
+	  
+	ErrorCounter++;
+
+      } else {
+
+	if ( PrintEntireLog ) { 
+	  std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
+		   <<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
+	}
+      }
+    }  
+    CsumLoggingCounter++; // advance first, so the message fires on the call that consumes the last entry
+    if ( CsumLogVector.size()==CsumLoggingCounter ) {
+      std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
+    }
+  }
+}
+void FlightRecorder::ReductionLog(double local,double global)
+{ // Print, record, or verify one reduction; only the global value is stored and compared.
+  uint64_t hex_l = * ( (uint64_t *)&local );
+  uint64_t hex_g = * ( (uint64_t *)&global );
+  if(LoggingMode == LoggingModePrint) {
+    std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
+    ReductionLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeRecord) {
+    std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
+    ReductionLogVector.push_back(global);
+    ReductionLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeVerify) {
+    if(ReductionLoggingCounter < ReductionLogVector.size()){ // calls past the end of the log are not compared
+      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
+	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
+		GridHostname(),
+		GlobalSharedMemory::WorldShmRank,
+		ReductionLoggingCounter,ReductionLogVector.size(),
+		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
+	if ( !ContinueOnFail ) assert(0);
+	ErrorCounter++;
+      } else {
+	if ( PrintEntireLog ) { 
+	  const std::ios_base::fmtflags saved = std::cerr.flags(); // std::hexfloat is sticky
+	  std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
+	  std::cerr.flags(saved); // restore the caller's number formatting on std::cerr
+	}
+      }
+    }
+    ReductionLoggingCounter++; // advance first, so the message fires on the call that consumes the last entry
+    if ( ReductionLogVector.size()==ReductionLoggingCounter ) {
+      std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
+    }
+  }
+}
+void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
+{ // Checksum a send buffer and print/record/verify it; the checksum path is compiled only under GRID_SYCL.
+  if ( ChecksumCommsSend ){
+  uint64_t *ubuf = (uint64_t *)buf;
+  if(LoggingMode == LoggingModeNone) return;
+#ifdef GRID_SYCL
+  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); // whole 64-bit words only
+  if(LoggingMode == LoggingModePrint) {
+    std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
+    XmitLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeRecord) {
+    std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
+    XmitLogVector.push_back(_xor);
+    XmitLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeVerify) {
+    if(XmitLoggingCounter < XmitLogVector.size()){ // calls past the end of the log are not compared
+      if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
+	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %llx expect glb %llx\n",
+		GridHostname(),
+		GlobalSharedMemory::WorldShmRank,
+		XmitLoggingCounter,XmitLogVector.size(),
+		(unsigned long long)_xor, (unsigned long long)XmitLogVector[XmitLoggingCounter]); fflush(stderr);
+	
+	if ( !ContinueOnFail ) assert(0);
+
+	ErrorCounter++;
+      } else {
+	if ( PrintEntireLog ) { 
+	  std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hex << _xor << " "<<  XmitLogVector[XmitLoggingCounter] <<std::dec <<std::endl;
+	}
+      }
+    }
+    XmitLoggingCounter++; // advance first, so the message fires on the call that consumes the last entry
+    if ( XmitLogVector.size()==XmitLoggingCounter ) {
+      std::cout << "FlightRecorder::xmitLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
+    }
+  }
+#endif
+  } else { // NOTE(review): with send checksums off, every call still does a copy and a world barrier - confirm intended
+    uint64_t word = 1;
+    deviceVector<uint64_t> dev(1);
+    acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
+    acceleratorCopySynchronise();
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+}
+void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
+{ // Checksum a receive buffer and print/record/verify it; rank is only used in the mismatch report.
+  if ( ChecksumComms ){
+  uint64_t *ubuf = (uint64_t *)buf;
+  if(LoggingMode == LoggingModeNone) return;
+#ifdef GRID_SYCL
+  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); // whole 64-bit words only
+  if(LoggingMode == LoggingModePrint) {
+    std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
+    RecvLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeRecord) {
+    std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
+    RecvLogVector.push_back(_xor);
+    RecvLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeVerify) {
+    if(RecvLoggingCounter < RecvLogVector.size()){ // calls past the end of the log are not compared
+      if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
+	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %llx expect glb %llx from MPI rank %d\n",
+		GridHostname(),
+		GlobalSharedMemory::WorldShmRank,
+		RecvLoggingCounter,RecvLogVector.size(),
+		(unsigned long long)_xor, (unsigned long long)RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
+	
+	if ( !ContinueOnFail ) assert(0);
+
+	ErrorCounter++;
+      } else {
+	if ( PrintEntireLog ) { 
+	  std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hex << _xor << " "<<  RecvLogVector[RecvLoggingCounter] <<std::dec <<std::endl;
+	}
+      }
+    }
+    RecvLoggingCounter++; // advance first, so the message fires on the call that consumes the last entry
+    if ( RecvLogVector.size()==RecvLoggingCounter ) {
+      std::cout << "FlightRecorder::recvLog : Verified entire sequence of "<<RecvLoggingCounter<<" receives "<<std::endl;
+    }
+  }
+#endif
+  }
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/util/FlightRecorder.h
+++ b/Grid/util/FlightRecorder.h
@@ -0,0 +1,43 @@
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+class FlightRecorder { // All-static logger used to record values in one run and compare them in a later one
+ public:
+  enum LoggingMode_t {
+    LoggingModeNone,   // the loggers take no action
+    LoggingModePrint,  // print each value to std::cerr, store nothing
+    LoggingModeRecord, // print each value and append it to the matching log vector
+    LoggingModeVerify  // compare each value against the matching log vector entry
+  };
+  // State shared by all loggers; the definitions live in FlightRecorder.cc.
+  static int                   LoggingMode;             // holds a LoggingMode_t value
+  static uint64_t              ErrorCounter;            // mismatches counted in Verify mode
+  static int32_t               XmitLoggingCounter;      // next index into XmitLogVector
+  static int32_t               RecvLoggingCounter;      // next index into RecvLogVector
+  static int32_t               CsumLoggingCounter;      // next index into CsumLogVector
+  static int32_t               NormLoggingCounter;      // next index into NormLogVector
+  static int32_t               ReductionLoggingCounter; // next index into ReductionLogVector
+  static std::vector<uint64_t> XmitLogVector;
+  static std::vector<uint64_t> RecvLogVector;
+  static std::vector<uint64_t> CsumLogVector;
+  static std::vector<double>   NormLogVector;
+  static std::vector<double>   ReductionLogVector;
+  static int ContinueOnFail;    // non-zero: count a mismatch instead of hitting assert(0)
+  static int PrintEntireLog;    // non-zero: Verify mode also prints entries that match
+  static int ChecksumComms;     // non-zero: recvLog is active
+  static int ChecksumCommsSend; // non-zero: xmitLog checksums the send buffer
+  static void SetLoggingModePrint(void);  // clears the logs, then selects Print
+  static void SetLoggingModeRecord(void); // clears the logs, then selects Record
+  static void SetLoggingModeVerify(void); // keeps the logs, rewinds the counters, selects Verify
+  static void SetLoggingMode(LoggingMode_t mode); // dispatches to the setters above; None clears everything
+  static void NormLog(double value);
+  static void CsumLog(uint64_t csum);
+  static void ReductionLog(double lcl, double glbl); // only glbl is recorded and compared
+  static void Truncate(void);      // empties all log vectors and rewinds the counters
+  static void ResetCounters(void); // rewinds the counters only
+  static uint64_t ErrorCount(void); // returns ErrorCounter
+  static void xmitLog(void *,uint64_t bytes);          // send buffer and its length in bytes
+  static void recvLog(void *,uint64_t bytes,int rank); // receive buffer, length in bytes, rank printed on mismatch
+};
+NAMESPACE_END(Grid);
+
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -90,129 +90,6 @@ NAMESPACE_BEGIN(Grid);
 static Coordinate Grid_default_latt;
 static Coordinate Grid_default_mpi;

-
-///////////////////////////////////////////////////////
-// Grid Norm logging for repro testing
-///////////////////////////////////////////////////////
-int GridNormLoggingMode;
-int32_t GridNormLoggingCounter;
-int32_t GridMPINormLoggingCounter;
-std::vector<double> GridNormLogVector;
-std::vector<double> GridMPINormLogVector;
-std::vector<uint32_t> GridCsumLogVector;
-
-void SetGridNormLoggingMode(GridNormLoggingMode_t mode)
-{
-  switch ( mode ) {
-  case GridNormLoggingModePrint:
-    SetGridNormLoggingModePrint();
-    break;
-  case GridNormLoggingModeRecord:
-    SetGridNormLoggingModeRecord();
-    break;
-  case GridNormLoggingModeVerify:
-    SetGridNormLoggingModeVerify();
-    break;
-  case GridNormLoggingModeNone:
-    GridNormLoggingMode = mode;
-    GridNormLoggingCounter=0;
-    GridMPINormLoggingCounter=0;
-    GridNormLogVector.resize(0);
-    GridCsumLogVector.resize(0);
-    GridMPINormLogVector.resize(0);
-    break;
-  default:
-    assert(0);
-  }
-}
-
-void SetGridNormLoggingModePrint(void)
-{
-  std::cout << " GridNormLogging Reproducibility logging set to print output " <<std::endl;
-  GridNormLoggingCounter = 0;
-  GridMPINormLoggingCounter=0;
-  GridNormLogVector.resize(0);
-  GridCsumLogVector.resize(0);
-  GridMPINormLogVector.resize(0);
-  GridNormLoggingMode = GridNormLoggingModePrint;
-}
-void SetGridNormLoggingModeRecord(void)
-{
-  std::cout << " GridNormLogging Reproducibility logging set to RECORD " <<std::endl;
-  GridNormLoggingCounter = 0;
-  GridMPINormLoggingCounter=0;
-  GridNormLogVector.resize(0);
-  GridCsumLogVector.resize(0);
-  GridMPINormLogVector.resize(0);
-  GridNormLoggingMode = GridNormLoggingModeRecord;
-}
-void SetGridNormLoggingModeVerify(void)
-{
-  std::cout << " GridNormLogging Reproducibility logging set to VERIFY " << GridNormLogVector.size()<< " log entries "<<std::endl;
-  GridNormLoggingCounter = 0;
-  GridMPINormLoggingCounter=0;
-  GridNormLoggingMode = GridNormLoggingModeVerify;
-}
-void GridNormLog(double value,uint32_t csum)
-{
-  if(GridNormLoggingMode == GridNormLoggingModePrint) {
-    std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value << " csum " <<std::hex<<csum<<std::dec <<std::endl;
-    GridNormLoggingCounter++;
-  }
-  if(GridNormLoggingMode == GridNormLoggingModeRecord) {
-    GridNormLogVector.push_back(value);
-    GridCsumLogVector.push_back(csum);
-    GridNormLoggingCounter++;
-  }
-  if(GridNormLoggingMode == GridNormLoggingModeVerify) {
-    assert(GridNormLoggingCounter < GridNormLogVector.size());
-    if ( (value != GridNormLogVector[GridNormLoggingCounter])
-	 || (csum!=GridCsumLogVector[GridNormLoggingCounter]) ) {
-      std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<GridNormLogVector[GridNormLoggingCounter] <<std::endl;
-      std::cerr << " Oops got csum "<< std::hex<<csum<<" expect "<<GridCsumLogVector[GridNormLoggingCounter] <<std::endl;
-      fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e %x %x\n",
-	      GridHostname(),
-	      GlobalSharedMemory::WorldShmRank,
-	      GridNormLoggingCounter,GridNormLogVector.size(),
-	      value, GridNormLogVector[GridNormLoggingCounter],
-	      csum, GridCsumLogVector[GridNormLoggingCounter]); fflush(stderr);
-      assert(0); // Force takedown of job
-    }
-    if ( GridNormLogVector.size()==GridNormLoggingCounter ) {
-      std::cout << " GridNormLogging : Verified entire sequence of "<<GridNormLoggingCounter<<" norms "<<std::endl;
-    }
-    GridNormLoggingCounter++;
-  }
-}
-void GridMPINormLog(double local,double result)
-{
-  if(GridNormLoggingMode == GridNormLoggingModePrint) {
-    std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << " -> " <<result <<std::endl;
-    GridMPINormLoggingCounter++;
-  }
-  if(GridNormLoggingMode == GridNormLoggingModeRecord) {
-    std::cerr<<"GridMPINormLog RECORDING : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl;
-    GridMPINormLogVector.push_back(result);
-    GridMPINormLoggingCounter++;
-  }
-  if(GridNormLoggingMode == GridNormLoggingModeVerify) {
-    std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl;
-    assert(GridMPINormLoggingCounter < GridMPINormLogVector.size());
-    if ( result != GridMPINormLogVector[GridMPINormLoggingCounter] ) {
-      fprintf(stderr,"%s:%d MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e hist %.16e\n",
-	      GridHostname(),
-	      GlobalSharedMemory::WorldShmRank,
-	      GridMPINormLoggingCounter,GridMPINormLogVector.size(),
-	      result, local, GridMPINormLogVector[GridMPINormLoggingCounter]); fflush(stderr);
-      assert(0); // Force takedown of job
-    }
-    if ( GridMPINormLogVector.size()==GridMPINormLoggingCounter ) {
-      std::cout << " GridMPINormLogging : Verified entire sequence of "<<GridMPINormLoggingCounter<<" norms "<<std::endl;
-    }
-    GridMPINormLoggingCounter++;
-  }
-}
-
 int GridThread::_threads =1;
 int GridThread::_hyperthreads=1;
 int GridThread::_cores=1;
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -70,21 +70,6 @@ void GridParseLayout(char **argv,int argc,
 void printHash(void);


-enum GridNormLoggingMode_t {
-  GridNormLoggingModeNone,
-  GridNormLoggingModePrint,
-  GridNormLoggingModeRecord,
-  GridNormLoggingModeVerify
-};
-//extern int GridNormLoggingMode;
-//extern int32_t GridNormLoggingCounter;
-//extern std::vector<double> GridNormLogVector;
-void SetGridNormLoggingModePrint(void);
-void SetGridNormLoggingModeRecord(void);
-void SetGridNormLoggingModeVerify(void);
-void SetGridNormLoggingMode(GridNormLoggingMode_t mode);
-void GridNormLog(double value,uint32_t csum);
-void GridMPINormLog(double lcl, double glbl);

 NAMESPACE_END(Grid);

--- a/Grid/util/Util.h
+++ b/Grid/util/Util.h
@@ -1,6 +1,6 @@
-#ifndef GRID_UTIL_H
-#define GRID_UTIL_H
+#pragma once
 #include <Grid/util/Coordinate.h>
 #include <Grid/util/Lexicographic.h>
 #include <Grid/util/Init.h>
-#endif
+#include <Grid/util/FlightRecorder.h>
+
--- a/configure.ac
+++ b/configure.ac
@@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in
 esac

 ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
-AC_ARG_ENABLE([accelerator-cshift],
-    [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
-    [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
+AC_ARG_ENABLE([accelerator-aware-mpi],
+    [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
+    [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])

-AC_ARG_ENABLE([ucx-buggy],
-    [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
-    [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
-
-case ${ac_UCXBUGGY} in
+case ${ac_ACCELERATOR_AWARE_MPI} in
    yes)
-    ac_ACC_CSHIFT=no;;
-    *);;
-esac
-
-case ${ac_ACC_CSHIFT} in
-    yes)
-      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
+      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
+      AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
    *);;
 esac

--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -1,16 +1,16 @@
-TOOLS=$HOME/tools
+
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
-	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
+	--enable-accelerator-aware-mpi=no\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"

--- a/systems/Aurora/sourceme-sunspot-deterministic.sh
+++ b/systems/Aurora/sourceme-sunspot-deterministic.sh
@@ -0,0 +1,2 @@
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
--- a/systems/Aurora/tests/repro1gpu.pbs
+++ b/systems/Aurora/tests/repro1gpu.pbs
@@ -30,6 +30,7 @@ export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

 cd $PBS_O_WORKDIR

--- a/systems/Aurora/tests/reproN.pbs
+++ b/systems/Aurora/tests/reproN.pbs
@@ -1,6 +1,6 @@
 #!/bin/bash

-#PBS -l select=16:system=sunspot,place=scatter
+#PBS -l select=32:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=02:00:00
 #PBS -N reproN
@@ -15,13 +15,23 @@ module load mpich/51.2/icc-all-deterministic-pmix-gpu
 # 56 cores / 6 threads ~9
 export OMP_NUM_THREADS=6
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+
+export GRID_PRINT_ENTIRE_LOG=0
+export GRID_CHECKSUM_RECV_BUF=0
+export GRID_CHECKSUM_SEND_BUF=0
+
 export MPICH_OFI_NIC_POLICY=GPU

 export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
@@ -46,29 +56,39 @@ cd $PBS_O_WORKDIR
 THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
 echo Node $n is $THIS_NODE

-DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE
+DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

 mkdir -p $DIR
 cd $DIR

 echo $THIS_NODE > nodefile

+#CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
+#	     ../../gpu_tile_compact.sh \
+#	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
+#		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+
 CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
 	     ../../gpu_tile_compact.sh \
 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
-		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

+echo $CMD > command-line
+env > environment
 $CMD &

 done

-wait
+# Suspicious wait is allowing jobs to collide and knock out
+#wait
+
+sleep 6500

 for n in ` eval echo {1..$NN} `
 do

 THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
-DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE
+DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

 cd $DIR

--- a/systems/Sunspot/config-command
+++ b/systems/Sunspot/config-command
@@ -1,4 +1,4 @@
-TOOLS=$HOME/tools
+
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"

--- a/systems/Sunspot/sourceme.sh
+++ b/systems/Sunspot/sourceme.sh
@@ -0,0 +1,2 @@
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
--- a/systems/Sunspot/tests/repro1gpu.pbs
+++ b/systems/Sunspot/tests/repro1gpu.pbs
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+#PBS -l select=16:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=02:00:00
+#PBS -N repro1gpu
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+cd $PBS_O_WORKDIR
+
+NN=`cat $PBS_NODEFILE | wc -l`
+echo $PBS_NODEFILE
+cat $PBS_NODEFILE
+
+echo $NN nodes in node file
+for n in `eval echo {1..$NN}`
+do
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+echo Node $n is $THIS_NODE
+
+
+for g in {0..11}
+do
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+
+export numa=${NUMA_MAP[$g]}
+export gpu_id=${GPU_MAP[$g]}
+export tile_id=${TILE_MAP[$g]}
+export gpu=$gpu_id.$tile_id
+
+cd $PBS_O_WORKDIR
+
+DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
+mkdir -p $DIR
+cd $DIR
+
+echo $THIS_NODE > nodefile
+echo $gpu > gpu
+
+export ZE_AFFINITY_MASK=$gpu
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+
+CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \
+	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
+		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
+echo $CMD
+$CMD &
+
+done
+done
+
+wait
+
--- a/systems/Sunspot/tests/reproN.pbs
+++ b/systems/Sunspot/tests/reproN.pbs
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+#PBS -l select=32:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=02:00:00
+#PBS -N reproN
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+
+export GRID_PRINT_ENTIRE_LOG=0
+export GRID_CHECKSUM_RECV_BUF=1
+export GRID_CHECKSUM_SEND_BUF=0
+
+export MPICH_OFI_NIC_POLICY=GPU
+
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+cd $PBS_O_WORKDIR
+
+NN=`cat $PBS_NODEFILE | wc -l`
+echo $PBS_NODEFILE
+cat $PBS_NODEFILE
+
+echo $NN nodes in node file
+for n in `eval echo {1..$NN}`
+do
+
+cd $PBS_O_WORKDIR
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+echo Node $n is $THIS_NODE
+
+DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
+
+mkdir -p $DIR
+cd $DIR
+
+echo $THIS_NODE > nodefile
+
+#CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
+#	     ../../gpu_tile_compact.sh \
+#	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
+#		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+
+CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
+	     ../../gpu_tile_compact.sh \
+	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
+		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+
+echo $CMD > command-line
+env > environment
+$CMD &
+
+done
+
+# Suspicious wait is allowing jobs to collide and knock out
+#wait
+
+sleep 6500
+
+for n in ` eval echo {1..$NN} `
+do
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
+
+cd $DIR
+
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm core.*
+
+done
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -34,6 +34,7 @@ using namespace Grid;
 #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 #endif

+
 NAMESPACE_BEGIN(Grid);
 template<class Matrix,class Field>
  class SchurDiagMooeeOperatorParanoid :  public SchurOperatorBase<Field> {
@@ -143,14 +144,21 @@ int main (int argc, char ** argv)

  time_t start = time(NULL);

-  uint32_t csum, csumref;
-  csumref=0;
+  FlightRecorder::ContinueOnFail = 0;
+  FlightRecorder::PrintEntireLog = 0;
+  FlightRecorder::ChecksumComms  = 1;
+  FlightRecorder::ChecksumCommsSend=0;
+
+  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms      = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend  = atoi(s);
+
  int iter=0;
  do {
    if ( iter == 0 ) {
-      SetGridNormLoggingMode(GridNormLoggingModeRecord);
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
    } else {
-      SetGridNormLoggingMode(GridNormLoggingModeVerify);
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
    }
    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
    result_o = Zero();
@@ -162,31 +170,22 @@ int main (int argc, char ** argv)
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
    std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
+    std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;

-    csum = crc(result_o);
+    assert(FlightRecorder::ErrorCount()==0);

-    if ( csumref == 0 ) {
-      csumref = csum;
-    } else {
-      if ( csum != csumref ) { 
-	std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
-	assert(0);
-      } else {
-	std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
-      }
-    }
+    std::cout << " FlightRecorder is OK! "<<std::endl;
    iter ++;
-  } while (time(NULL) < (start + nsecs/2) );
+  } while (time(NULL) < (start + nsecs/10) );
    
  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  csumref=0;
  int i=0;
  do { 
    if ( i == 0 ) {
-      SetGridNormLoggingMode(GridNormLoggingModeRecord);
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
    } else {
-      SetGridNormLoggingMode(GridNormLoggingModeVerify);
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
    }
    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
    result_o_2 = Zero();
@@ -199,19 +198,9 @@ int main (int argc, char ** argv)

    std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-
-    csum = crc(result_o);
-
-    if ( csumref == 0 ) {
-      csumref = csum;
-    } else {
-      if ( csum != csumref ) { 
-	std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
-	assert(0);
-      } else {
-	std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
-      }
-    }
+    std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
+    assert(FlightRecorder::ErrorCount()==0);
+    std::cout << " FlightRecorder is OK! "<<std::endl;
    i++;
  } while (time(NULL) < (start + nsecs) );
Author	SHA1	Message	Date
dbollweg	da81a73b4a	Merge `461cd045c6` into `da59379612`	2024-03-27 02:11:32 +09:00
Peter Boyle	da59379612	Large reg file for double	2024-03-26 17:03:20 +00:00
Peter Boyle	3ef2a41518	ifdef guard ommitted	2024-03-26 14:50:32 +00:00
Peter Boyle	aa96f420c6	Acclerator ware MPI guard on the Unix domain sockets	2024-03-26 14:41:25 +00:00
Peter Boyle	49e9e4ed0e	Fences	2024-03-26 14:14:06 +00:00
Peter Boyle	f7b8163016	Deterministic MPI reduce options	2024-03-26 14:11:40 +00:00
Peter Boyle	93769eacd3	Updated configure for bounce through host	2024-03-26 14:10:24 +00:00
Peter Boyle	59b0cc11df	REduce the time in single	2024-03-26 00:42:40 +00:00
Peter Boyle	f32c275376	Updated config options for MPI not being aware of GPU	2024-03-26 00:42:00 +00:00
Peter Boyle	5404fc66ab	Merge needs a fence on SYCL	2024-03-26 00:38:41 +00:00
Peter Boyle	1f53458af8	Options to bounce through a host buffer if --disable-accelerator-aware-mpi	2024-03-26 00:37:19 +00:00
Peter Boyle	434c3e7f1d	We have a choice of GET or PUT across NVlink	2024-03-25 14:32:44 +00:00
Peter Boyle	500b119f3d	Deterministic MPI	2024-03-22 15:55:23 +00:00
Peter Boyle	4b87259c1b	New config command for sunspot	2024-03-22 15:43:49 +00:00
Peter Boyle	503dec34ef	This appears working now on Sunspot	2024-03-22 15:43:30 +00:00
Peter Boyle	d1e9fe50d2	Xor csum for repro testing	2024-03-22 15:42:57 +00:00
Peter Boyle	d01e5fa838	Improved FlightRecorder	2024-03-22 15:42:32 +00:00
Peter Boyle	a477c25e8c	Sunspot repro tests	2024-03-22 15:42:11 +00:00
Peter Boyle	1bd20cd9e8	FlightRecorder	2024-03-22 15:40:01 +00:00
Peter Boyle	e49e95b037	Upgrade of the Britney test with flight recorder and fast xor checksum	2024-03-22 15:39:27 +00:00
Peter Boyle	6f59fed563	Flight recorder, resurrecting the "world famous" Britney test	2024-03-22 15:32:32 +00:00
Peter Boyle	60b7f6c99d	Flight recorder, resurrecting the "world famous" Britney test	2024-03-22 15:32:26 +00:00
Peter Boyle	b92dfcc8d3	Flight recorder, resurrecting the "world famous" Britney test	2024-03-22 15:30:27 +00:00
Peter Boyle	f6fd6dd053	Flight recorder, resurrecting the "world famous" Britney test	2024-03-22 15:30:01 +00:00
dbollweg	461cd045c6	sliceSum cleanup	2024-03-13 18:18:44 -04:00
dbollweg	fee65d7a75	Merge branch 'paboyle:develop' into sycl_slicesum_update	2024-03-13 18:06:17 -04:00
dbollweg	31f9971dbf	avoid PI_ERROR_OUT_OF_RESOURCES in sycl sliceSum	2024-03-13 13:39:26 -04:00
dbollweg	d87296f3e8	Merge branch 'develop' of https://github.com/dbollweg/Grid into develop	2024-03-06 16:54:22 -05:00
dbollweg	be94cf1c6f	Fewer wait-calls in sycl slicesum	2024-03-06 16:53:13 -05:00