
Merge branch 'develop' of https://github.com/paboyle/Grid into develop

Peter Boyle 2021-09-21 12:18:05 -07:00
commit b2ccaad761
15 changed files with 594 additions and 171 deletions

View File

@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
bool Stencil_force_mpi = true;
///////////////////////////////////////////////////////////////
// Info that is setup once and independent of cartesian layout
///////////////////////////////////////////////////////////////

View File

@ -35,11 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
#ifdef GRID_MPI3_SHM_NVLINK
const bool Stencil_force_mpi = true;
#else
const bool Stencil_force_mpi = false;
#endif
extern bool Stencil_force_mpi ;
class CartesianCommunicator : public SharedMemory {
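
The two hunks above replace the compile-time const (selected by GRID_MPI3_SHM_NVLINK) with a runtime-settable global: the header declares it extern, one translation unit defines the default, and Grid_init() can override it from the command line. A minimal, self-contained sketch of that pattern (file roles are illustrative, not Grid's actual layout):

#include <cstdlib>
#include <iostream>

// Header side (in Grid: the communicator header): declaration visible to every translation unit.
extern bool Stencil_force_mpi;

// Exactly one translation unit (the communicator .cc) provides the definition with its default.
bool Stencil_force_mpi = true;

int main(int argc, char **argv) {
  // Runtime override, analogous to Grid_init() parsing --shm-mpi 0|1 later in this commit.
  if (argc > 1) Stencil_force_mpi = (std::atoi(argv[1]) != 0);
  std::cout << "Stencil_force_mpi = " << Stencil_force_mpi << std::endl;
  return 0;
}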

View File

@ -384,6 +384,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
// TODO : make an OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
acceleratorCopySynchronise(); // MPI prob slower
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
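
A hedged, self-contained sketch of the decision this hunk adds: go through MPI when the peer is off node or Stencil_force_mpi is set, otherwise copy straight into the peer's shared-memory window. Only the branch structure is taken from the diff; plain memcpy stands in for the device copy and ShmBufferTranslate is mocked.

#include <cstdio>
#include <cstring>

bool Stencil_force_mpi = true;                 // runtime flag introduced in this commit

// Stand-in for CartesianCommunicator::ShmBufferTranslate: NULL means the peer is off node.
void *ShmBufferTranslateStub(int dest) {
  static char peer_window[1024];
  return (dest % 2 == 0) ? (void *)peer_window : nullptr;
}

void StencilSend(const void *xmit, int dest, size_t bytes) {
  void *shm = ShmBufferTranslateStub(dest);
  if (shm == nullptr || Stencil_force_mpi) {
    std::printf("dest %d: post MPI_Isend/MPI_Irecv for %zu bytes\n", dest, bytes);
  } else {
    // Grid uses acceleratorCopyDeviceToDeviceAsynch + acceleratorCopySynchronise here.
    std::memcpy(shm, xmit, bytes);
    std::printf("dest %d: direct copy into peer's shm window (%zu bytes)\n", dest, bytes);
  }
}

int main() {
  char halo[64] = "face data";
  Stencil_force_mpi = false;                   // e.g. set from --shm-mpi 0
  StencilSend(halo, 0, sizeof(halo));          // on-node peer: direct shm copy
  StencilSend(halo, 1, sizeof(halo));          // off-node peer: MPI path
  return 0;
}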

View File

@ -543,6 +543,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
ze_ipc_mem_handle_t handle;
if ( r==WorldShmRank ) {
@ -580,6 +582,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
}
}
#endif
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
@ -595,7 +598,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
@ -636,7 +639,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
WorldShmCommBufs[r] = thisBuf;
}
WorldShmCommBufs[r] = thisBuf;
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif

View File

@ -326,21 +326,8 @@ public:
int xmit_to_rank;
if ( ! comm_dim ) return 1;
int nbr_proc;
if (displacement>0) nbr_proc = 1;
else nbr_proc = pd-1;
// FIXME this logic needs to be sorted for three link term
// assert( (displacement==1) || (displacement==-1));
// Present hack only works for >= 4^4 subvol per node
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
if ( (shm==NULL) || Stencil_force_mpi ) return 0;
return 1;
if ( displacement == 0 ) return 1;
return 0;
}
//////////////////////////////////////////
@ -1020,7 +1007,6 @@ public:
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
int shm_receive_only = 1;
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
@ -1052,10 +1038,6 @@ public:
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
/////////////////////////////////////////////////////////
// try the direct copy if possible
/////////////////////////////////////////////////////////
cobj *send_buf;
cobj *recv_buf;
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
@ -1063,52 +1045,36 @@ public:
recv_buf=this->u_recv_buf_p;
}
send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
if ( (send_buf==NULL) || Stencil_force_mpi ) {
send_buf = this->u_send_buf_p;
}
// Find out if we get the direct copy.
void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p);
if ((success==NULL)||Stencil_force_mpi) {
// we found a packet that comes from MPI and contributes to this leg of stencil
shm_receive_only = 0;
}
cobj *send_buf;
send_buf = this->u_send_buf_p; // Gather locally, must send
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
gathertime-=usecond();
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
gathertime+=usecond();
///////////////////////////////////////////////////////////
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&recv_buf[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
if ( compress.DecompressionStep() ) {
if ( shm_receive_only ) { // Early decompress before MPI is finished is possible
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,DecompressionsSHM);
} else { // Decompress after MPI is finished
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,Decompressions);
}
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&recv_buf[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
} else {
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&this->u_recv_buf_p[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,Decompressions);
}
u_comm_offset+=words;
}
}
return shm_receive_only;
return 0;
}
template<class compressor>
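
The hunk above removes the write-directly-into-the-peer shortcut from the gather path: the face is always gathered into u_send_buf_p, a packet is posted, and decompression runs after MPI completes. A small, self-contained sketch of the "gather locally, then send" step, with plain doubles in place of Grid's compressed objects:

#include <cstdio>
#include <vector>

// Gather the face sites listed in 'table' from the local field into a contiguous send buffer.
void GatherPlaneSimple(const std::vector<int> &table, const std::vector<double> &field,
                       std::vector<double> &send_buf, size_t offset) {
  for (size_t i = 0; i < table.size(); i++) send_buf[offset + i] = field[table[i]];
}

int main() {
  std::vector<double> field(64);
  for (int i = 0; i < 64; i++) field[i] = i;
  std::vector<int> face = {3, 7, 11, 15};      // illustrative face-site indices
  std::vector<double> send_buf(face.size(), 0.0);
  GatherPlaneSimple(face, field, send_buf, 0); // "Gather locally, must send"
  // ...then AddPacket(&send_buf[off], &recv_buf[off], xmit_to_rank, recv_from_rank, bytes)
  // and, once MPI finishes, AddDecompress into u_recv_buf_p, as in the hunk above.
  std::printf("gathered: %g %g %g %g\n", send_buf[0], send_buf[1], send_buf[2], send_buf[3]);
  return 0;
}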
@ -1159,7 +1125,6 @@ public:
int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
int shm_receive_only = 1;
for(int x=0;x<rd;x++){
int any_offnode = ( ((x+sshift)%fd) >= rd );
@ -1214,20 +1179,7 @@ public:
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
// shm == receive pointer if offnode
// shm == Translate[send pointer] if on node -- my view of his send pointer
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
if ((shm==NULL)||Stencil_force_mpi) {
shm = rp;
// we found a packet that comes from MPI and contributes to this shift.
// is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
// Kernel will add the exterior_terms except if is_same_node.
shm_receive_only = 0;
// leg of stencil
}
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
// assuming above pointer flip
rpointers[i] = shm;
rpointers[i] = rp;
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
@ -1239,102 +1191,17 @@ public:
}
}
if ( shm_receive_only ) {
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM);
} else {
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
}
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
u_comm_offset +=buffer_size;
}
}
return shm_receive_only;
return 0;
}
void ZeroCounters(void) {
gathertime = 0.;
commtime = 0.;
mpi3synctime=0.;
mpi3synctime_g=0.;
shmmergetime=0.;
for(int i=0;i<this->_npoints;i++){
comm_time_thr[i]=0;
comm_bytes_thr[i]=0;
comm_enter_thr[i]=0;
comm_leave_thr[i]=0;
shm_bytes_thr[i]=0;
}
halogtime = 0.;
mergetime = 0.;
decompresstime = 0.;
gathermtime = 0.;
splicetime = 0.;
nosplicetime = 0.;
comms_bytes = 0.;
shm_bytes = 0.;
calls = 0.;
};
void ZeroCounters(void) { };
void Report(void) {
#define AVERAGE(A)
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
double t = 0;
// if comm_time_thr is set they were all done in parallel so take the max
// but add up the bytes
int threaded = 0 ;
for (int i = 0; i < 8; ++i) {
if ( comm_time_thr[i]>0.0 ) {
threaded = 1;
comms_bytes += comm_bytes_thr[i];
shm_bytes += shm_bytes_thr[i];
if (t < comm_time_thr[i]) t = comm_time_thr[i];
}
}
if (threaded) commtime += t;
_grid->GlobalSum(commtime); commtime/=NP;
if ( calls > 0. ) {
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
PRINTIT(halogtime);
PRINTIT(gathertime);
PRINTIT(gathermtime);
PRINTIT(mergetime);
PRINTIT(decompresstime);
if(comms_bytes>1.0){
PRINTIT(comms_bytes);
PRINTIT(commtime);
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
}
if(shm_bytes>1.0){
PRINTIT(shm_bytes); // X bytes + R bytes
// Double this to include spin projection overhead with 2:1 ratio in wilson
auto gatheralltime = gathertime+gathermtime;
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
auto all_bytes = comms_bytes+shm_bytes;
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
auto membytes = (shm_bytes + comms_bytes/2) // read/write
+ (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj);
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
}
/*
PRINTIT(mpi3synctime);
PRINTIT(mpi3synctime_g);
PRINTIT(shmmergetime);
PRINTIT(splicetime);
PRINTIT(nosplicetime);
*/
}
#undef PRINTIT
#undef AVERAGE
};
void Report(void) { };
};
NAMESPACE_END(Grid);

View File

@ -8,6 +8,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
void acceleratorInit(void)
{
int nDevices = 1;

View File

@ -105,6 +105,7 @@ void acceleratorInit(void);
#define accelerator_inline __host__ __device__ inline
extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@ -213,9 +214,13 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline int acceleratorIsCommunicable(void *ptr)
{
// int uvm=0;
@ -289,7 +294,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
theGridAccelerator->memcpy(to,from,bytes);
}
inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
@ -394,7 +402,8 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
#endif
@ -435,7 +444,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
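
The new acceleratorCopyDeviceToDeviceAsynch / acceleratorCopySynchronise pair lets several copies be queued before a single synchronisation point. A hedged usage sketch using CPU stand-ins that mirror the host fallback in this hunk; on CUDA the asynch variant enqueues on copyStream and the synchronise is cudaStreamSynchronize(copyStream).

#include <cstdio>
#include <cstring>
#include <vector>

// CPU stand-ins: plain memcpy and a no-op synchronisation, as in the non-accelerator build.
inline void acceleratorCopyDeviceToDeviceAsynch(void *from, void *to, size_t bytes) { memcpy(to, from, bytes); }
inline void acceleratorCopySynchronise(void) {}

int main() {
  std::vector<double> src_a(256, 1.0), src_b(256, 2.0), dst_a(256), dst_b(256);
  // Queue two halo-sized copies, then synchronise once before the data is consumed.
  acceleratorCopyDeviceToDeviceAsynch(src_a.data(), dst_a.data(), src_a.size() * sizeof(double));
  acceleratorCopyDeviceToDeviceAsynch(src_b.data(), dst_b.data(), src_b.size() * sizeof(double));
  acceleratorCopySynchronise();
  std::printf("dst_a[0]=%g dst_b[0]=%g\n", dst_a[0], dst_b[0]);
  return 0;
}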

View File

@ -301,6 +301,13 @@ void Grid_init(int *argc,char ***argv)
GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-mpi") ){
int forcempi;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm-mpi");
GridCmdOptionInt(arg,forcempi);
Stencil_force_mpi = (bool)forcempi;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){
int MB;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem");
@ -419,7 +426,9 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --shm-mpi 0|1 : Force MPI usage under multi-rank per node "<<std::endl;
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
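
With the parsing above, the flag can be toggled per run to compare forced-MPI against direct intranode copies. A hedged example invocation (options modelled on the benchmark scripts later in this commit; only --shm-mpi is new):

srun -N 4 -n 16 ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 \
     --accelerator-threads 8 --shm 2048 --shm-mpi 1    # force MPI even between ranks on one node
# repeat with --shm-mpi 0 to allow direct shm/NVLink copies for comparison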

View File

@ -0,0 +1,129 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1463a0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.729967 s : Grid is setup to use 4 threads
Grid : Message : 0.729975 s : Number of iterations to average: 250
Grid : Message : 0.729977 s : ====================================================================================================
Grid : Message : 0.729978 s : = Benchmarking sequential halo exchange from host memory
Grid : Message : 0.729979 s : ====================================================================================================
Grid : Message : 0.729980 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max)
Grid : Message : 0.749870 s : 8 8 393216 50783.4 101566.8
Grid : Message : 0.764282 s : 8 8 393216 54704.5 109409.0
Grid : Message : 0.780310 s : 8 8 393216 49090.6 98181.3
Grid : Message : 0.796479 s : 8 8 393216 48662.3 97324.7
Grid : Message : 0.841551 s : 12 8 1327104 66728.9 133457.8
Grid : Message : 0.880653 s : 12 8 1327104 67932.9 135865.9
Grid : Message : 0.920097 s : 12 8 1327104 67304.2 134608.4
Grid : Message : 0.961444 s : 12 8 1327104 64205.9 128411.8
Grid : Message : 1.660890 s : 16 8 3145728 67833.1 135666.3
Grid : Message : 1.153006 s : 16 8 3145728 72416.3 144832.6
Grid : Message : 1.240962 s : 16 8 3145728 71536.1 143072.2
Grid : Message : 1.330372 s : 16 8 3145728 70372.7 140745.3
Grid : Message : 1.519996 s : 20 8 6144000 71017.4 142034.8
Grid : Message : 1.667745 s : 20 8 6144000 83189.5 166378.9
Grid : Message : 1.817908 s : 20 8 6144000 81836.5 163673.1
Grid : Message : 1.969344 s : 20 8 6144000 81148.0 162296.0
Grid : Message : 2.260249 s : 24 8 10616832 79299.9 158599.8
Grid : Message : 2.512319 s : 24 8 10616832 84249.2 168498.4
Grid : Message : 2.763820 s : 24 8 10616832 84430.4 168860.9
Grid : Message : 3.172850 s : 24 8 10616832 83776.5 167553.1
Grid : Message : 3.460951 s : 28 8 16859136 82176.6 164353.1
Grid : Message : 3.859348 s : 28 8 16859136 84642.9 169285.9
Grid : Message : 4.254351 s : 28 8 16859136 85366.0 170731.9
Grid : Message : 4.651748 s : 28 8 16859136 84850.2 169700.4
Grid : Message : 5.302166 s : 32 8 25165824 83402.1 166804.1
Grid : Message : 5.889123 s : 32 8 25165824 85756.3 171512.6
Grid : Message : 6.472357 s : 32 8 25165824 86299.1 172598.3
Grid : Message : 7.572140 s : 32 8 25165824 86059.7 172119.3
Grid : Message : 7.578700 s : ====================================================================================================
Grid : Message : 7.578740 s : = Benchmarking sequential halo exchange from GPU memory
Grid : Message : 7.578750 s : ====================================================================================================
Grid : Message : 7.578760 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max)
Grid : Message : 7.119231 s : 8 8 393216 13844.9 27689.8
Grid : Message : 7.150661 s : 8 8 393216 25034.4 50068.9
Grid : Message : 7.173800 s : 8 8 393216 34002.0 68004.0
Grid : Message : 7.197415 s : 8 8 393216 33317.7 66635.5
Grid : Message : 7.240696 s : 12 8 1327104 110772.0 221544.0
Grid : Message : 7.263466 s : 12 8 1327104 116627.5 233254.9
Grid : Message : 7.310752 s : 12 8 1327104 56142.8 112285.6
Grid : Message : 7.356881 s : 12 8 1327104 57551.3 115102.6
Grid : Message : 7.422351 s : 16 8 3145728 167086.0 334172.0
Grid : Message : 7.458334 s : 16 8 3145728 174903.6 349807.1
Grid : Message : 7.558746 s : 16 8 3145728 62663.3 125326.6
Grid : Message : 7.658824 s : 16 8 3145728 62871.8 125743.6
Grid : Message : 7.741423 s : 20 8 6144000 231840.3 463680.6
Grid : Message : 7.794862 s : 20 8 6144000 229996.1 459992.1
Grid : Message : 7.982472 s : 20 8 6144000 65501.1 131002.1
Grid : Message : 8.170548 s : 20 8 6144000 65338.8 130677.5
Grid : Message : 8.277182 s : 24 8 10616832 274319.0 548638.0
Grid : Message : 8.354585 s : 24 8 10616832 274365.1 548730.2
Grid : Message : 8.675675 s : 24 8 10616832 66132.8 132265.7
Grid : Message : 8.999237 s : 24 8 10616832 65627.4 131254.7
Grid : Message : 9.140302 s : 28 8 16859136 300825.0 601650.0
Grid : Message : 9.251320 s : 28 8 16859136 303749.1 607498.1
Grid : Message : 9.632241 s : 28 8 16859136 88520.3 177040.6
Grid : Message : 9.999663 s : 28 8 16859136 91772.9 183545.7
Grid : Message : 10.183071 s : 32 8 25165824 328325.5 656651.1
Grid : Message : 10.335093 s : 32 8 25165824 331109.7 662219.3
Grid : Message : 10.875980 s : 32 8 25165824 93056.0 186111.9
Grid : Message : 11.418666 s : 32 8 25165824 92747.5 185495.0
Grid : Message : 11.434792 s : ====================================================================================================
Grid : Message : 11.434797 s : = All done; Bye Bye
Grid : Message : 11.434798 s : ====================================================================================================

View File

@ -0,0 +1,14 @@
LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$LIME \
--disable-accelerator-cshift \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"

View File

@ -0,0 +1,156 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ac40000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=f660dc67e4b193afc4015bc5e5fe47cfdbb0356e: (HEAD -> develop, origin/develop, origin/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.910318 s : Grid Layout
Grid : Message : 0.910320 s : Global lattice size : 64 64 64 256
Grid : Message : 0.910325 s : OpenMP threads : 4
Grid : Message : 0.910326 s : MPI tasks : 2 2 2 8
Grid : Message : 0.973956 s : Making s innermost grids
Grid : Message : 1.198830 s : Initialising 4d RNG
Grid : Message : 1.119813 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.119870 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.683307 s : Initialising 5d RNG
Grid : Message : 4.220535 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.220563 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.198140 s : Initialised RNGs
Grid : Message : 39.952612 s : Drawing gauge field
Grid : Message : 40.488019 s : Random gauge initialised
Grid : Message : 42.659220 s : Setting up Cshift based reference
Grid : Message : 47.622210 s : *****************************************************************
Grid : Message : 47.622236 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 47.622237 s : *****************************************************************
Grid : Message : 47.622238 s : *****************************************************************
Grid : Message : 47.622239 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 47.622240 s : * Vectorising space-time by 8
Grid : Message : 47.622241 s : * VComplexF size is 64 B
Grid : Message : 47.622242 s : * SINGLE precision
Grid : Message : 47.622243 s : * Using Overlapped Comms/Compute
Grid : Message : 47.622244 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 47.622245 s : *****************************************************************
Grid : Message : 48.950210 s : Called warmup
Grid : Message : 77.311124 s : Called Dw 3000 times in 2.83592e+07 us
Grid : Message : 77.311181 s : mflop/s = 1.49934e+08
Grid : Message : 77.311184 s : mflop/s per rank = 2.34273e+06
Grid : Message : 77.311185 s : mflop/s per node = 9.37091e+06
Grid : Message : 77.311186 s : RF GiB/s (base 2) = 304663
Grid : Message : 77.311187 s : mem GiB/s (base 2) = 190415
Grid : Message : 77.314752 s : norm diff 1.03478e-13
Grid : Message : 77.349587 s : #### Dhop calls report
Grid : Message : 77.349591 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 77.349613 s : WilsonFermion5D TotalTime /Calls : 4761.53 us
Grid : Message : 77.349615 s : WilsonFermion5D CommTime /Calls : 3363.09 us
Grid : Message : 77.349616 s : WilsonFermion5D FaceTime /Calls : 469.094 us
Grid : Message : 77.349617 s : WilsonFermion5D ComputeTime1/Calls : 26.8794 us
Grid : Message : 77.349618 s : WilsonFermion5D ComputeTime2/Calls : 949.276 us
Grid : Message : 77.349702 s : Average mflops/s per call : 2.68569e+10
Grid : Message : 77.349710 s : Average mflops/s per call per rank : 4.1964e+08
Grid : Message : 77.349711 s : Average mflops/s per call per node : 1.67856e+09
Grid : Message : 77.349712 s : Average mflops/s per call (full) : 1.51538e+08
Grid : Message : 77.349713 s : Average mflops/s per call per rank (full): 2.36779e+06
Grid : Message : 77.349714 s : Average mflops/s per call per node (full): 9.47115e+06
Grid : Message : 77.349715 s : WilsonFermion5D Stencil
Grid : Message : 77.349716 s : WilsonFermion5D StencilEven
Grid : Message : 77.349717 s : WilsonFermion5D StencilOdd
Grid : Message : 77.349718 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 77.349719 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 77.349720 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 104.883719 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 104.883743 s : Called DwDag
Grid : Message : 104.883744 s : norm dag result 12.0421
Grid : Message : 104.901901 s : norm dag ref 12.0421
Grid : Message : 104.917822 s : norm dag diff 7.63254e-14
Grid : Message : 104.957229 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 105.334551 s : src_e0.499998
Grid : Message : 105.416616 s : src_o0.500002
Grid : Message : 105.486729 s : *********************************************************
Grid : Message : 105.486732 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 105.486733 s : * Vectorising space-time by 8
Grid : Message : 105.486734 s : * SINGLE precision
Grid : Message : 105.486739 s : * Using Overlapped Comms/Compute
Grid : Message : 105.486740 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 105.486741 s : *********************************************************
Grid : Message : 119.695464 s : Deo mflop/s = 1.5039e+08
Grid : Message : 119.695494 s : Deo mflop/s per rank 2.34984e+06
Grid : Message : 119.695496 s : Deo mflop/s per node 9.39937e+06
Grid : Message : 119.695502 s : #### Dhop calls report
Grid : Message : 119.695503 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 119.695505 s : WilsonFermion5D TotalTime /Calls : 4734.45 us
Grid : Message : 119.695507 s : WilsonFermion5D CommTime /Calls : 3287.23 us
Grid : Message : 119.695508 s : WilsonFermion5D FaceTime /Calls : 537.724 us
Grid : Message : 119.695509 s : WilsonFermion5D ComputeTime1/Calls : 16.0483 us
Grid : Message : 119.695510 s : WilsonFermion5D ComputeTime2/Calls : 939.854 us
Grid : Message : 119.695533 s : Average mflops/s per call : 4.50726e+10
Grid : Message : 119.695535 s : Average mflops/s per call per rank : 7.04259e+08
Grid : Message : 119.695536 s : Average mflops/s per call per node : 2.81703e+09
Grid : Message : 119.695537 s : Average mflops/s per call (full) : 1.52405e+08
Grid : Message : 119.695538 s : Average mflops/s per call per rank (full): 2.38133e+06
Grid : Message : 119.695539 s : Average mflops/s per call per node (full): 9.52532e+06
Grid : Message : 119.695540 s : WilsonFermion5D Stencil
Grid : Message : 119.695541 s : WilsonFermion5D StencilEven
Grid : Message : 119.695542 s : WilsonFermion5D StencilOdd
Grid : Message : 119.695543 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 119.695544 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 119.695545 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 119.752707 s : r_e6.02108
Grid : Message : 119.759448 s : r_o6.02101
Grid : Message : 119.765382 s : res12.0421
Grid : Message : 120.419093 s : norm diff 0
Grid : Message : 120.829772 s : norm diff even 0
Grid : Message : 120.909078 s : norm diff odd 0

View File

@ -0,0 +1,156 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e9c0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.717713 s : Grid Layout
Grid : Message : 0.717716 s : Global lattice size : 64 64 64 64
Grid : Message : 0.717724 s : OpenMP threads : 4
Grid : Message : 0.717725 s : MPI tasks : 2 2 2 2
Grid : Message : 0.801634 s : Making s innermost grids
Grid : Message : 0.844903 s : Initialising 4d RNG
Grid : Message : 0.940001 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.940060 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.338368 s : Initialising 5d RNG
Grid : Message : 2.859273 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.859304 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 11.140924 s : Initialised RNGs
Grid : Message : 13.433456 s : Drawing gauge field
Grid : Message : 13.955847 s : Random gauge initialised
Grid : Message : 15.528535 s : Setting up Cshift based reference
Grid : Message : 21.484340 s : *****************************************************************
Grid : Message : 21.484840 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 21.484860 s : *****************************************************************
Grid : Message : 21.484870 s : *****************************************************************
Grid : Message : 21.484880 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 21.484890 s : * Vectorising space-time by 8
Grid : Message : 21.484900 s : * VComplexF size is 64 B
Grid : Message : 21.484910 s : * SINGLE precision
Grid : Message : 21.484920 s : * Using Overlapped Comms/Compute
Grid : Message : 21.484930 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 21.484940 s : *****************************************************************
Grid : Message : 22.344741 s : Called warmup
Grid : Message : 49.832292 s : Called Dw 3000 times in 2.74873e+07 us
Grid : Message : 49.832358 s : mflop/s = 3.86726e+07
Grid : Message : 49.832360 s : mflop/s per rank = 2.41704e+06
Grid : Message : 49.832361 s : mflop/s per node = 9.66814e+06
Grid : Message : 49.832362 s : RF GiB/s (base 2) = 78581.7
Grid : Message : 49.832363 s : mem GiB/s (base 2) = 49113.6
Grid : Message : 49.835924 s : norm diff 1.03481e-13
Grid : Message : 49.870568 s : #### Dhop calls report
Grid : Message : 49.870574 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 49.870598 s : WilsonFermion5D TotalTime /Calls : 4616.79 us
Grid : Message : 49.870600 s : WilsonFermion5D CommTime /Calls : 3241.77 us
Grid : Message : 49.870601 s : WilsonFermion5D FaceTime /Calls : 469.006 us
Grid : Message : 49.870602 s : WilsonFermion5D ComputeTime1/Calls : 27.0492 us
Grid : Message : 49.870603 s : WilsonFermion5D ComputeTime2/Calls : 926.33 us
Grid : Message : 49.870614 s : Average mflops/s per call : 6.71631e+09
Grid : Message : 49.870619 s : Average mflops/s per call per rank : 4.19769e+08
Grid : Message : 49.870621 s : Average mflops/s per call per node : 1.67908e+09
Grid : Message : 49.870626 s : Average mflops/s per call (full) : 3.90723e+07
Grid : Message : 49.870627 s : Average mflops/s per call per rank (full): 2.44202e+06
Grid : Message : 49.870628 s : Average mflops/s per call per node (full): 9.76808e+06
Grid : Message : 49.870629 s : WilsonFermion5D Stencil
Grid : Message : 49.870630 s : WilsonFermion5D StencilEven
Grid : Message : 49.870631 s : WilsonFermion5D StencilOdd
Grid : Message : 49.870632 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 49.870633 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 49.870634 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 77.321890 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 77.321911 s : Called DwDag
Grid : Message : 77.321912 s : norm dag result 12.0421
Grid : Message : 77.334619 s : norm dag ref 12.0421
Grid : Message : 77.350515 s : norm dag diff 7.63236e-14
Grid : Message : 77.389923 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 77.769815 s : src_e0.499997
Grid : Message : 77.847560 s : src_o0.500003
Grid : Message : 77.917493 s : *********************************************************
Grid : Message : 77.917496 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 77.917497 s : * Vectorising space-time by 8
Grid : Message : 77.917498 s : * SINGLE precision
Grid : Message : 77.917499 s : * Using Overlapped Comms/Compute
Grid : Message : 77.917500 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 77.917501 s : *********************************************************
Grid : Message : 91.412946 s : Deo mflop/s = 3.95925e+07
Grid : Message : 91.412978 s : Deo mflop/s per rank 2.47453e+06
Grid : Message : 91.412980 s : Deo mflop/s per node 9.89813e+06
Grid : Message : 91.412983 s : #### Dhop calls report
Grid : Message : 91.412984 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 91.412986 s : WilsonFermion5D TotalTime /Calls : 4496.84 us
Grid : Message : 91.412988 s : WilsonFermion5D CommTime /Calls : 3057.28 us
Grid : Message : 91.412989 s : WilsonFermion5D FaceTime /Calls : 528.499 us
Grid : Message : 91.412990 s : WilsonFermion5D ComputeTime1/Calls : 16.1939 us
Grid : Message : 91.412991 s : WilsonFermion5D ComputeTime2/Calls : 942.557 us
Grid : Message : 91.413021 s : Average mflops/s per call : 1.12574e+10
Grid : Message : 91.413023 s : Average mflops/s per call per rank : 7.03586e+08
Grid : Message : 91.413024 s : Average mflops/s per call per node : 2.81434e+09
Grid : Message : 91.413025 s : Average mflops/s per call (full) : 4.01145e+07
Grid : Message : 91.413026 s : Average mflops/s per call per rank (full): 2.50716e+06
Grid : Message : 91.413027 s : Average mflops/s per call per node (full): 1.00286e+07
Grid : Message : 91.413028 s : WilsonFermion5D Stencil
Grid : Message : 91.413029 s : WilsonFermion5D StencilEven
Grid : Message : 91.413030 s : WilsonFermion5D StencilOdd
Grid : Message : 91.413031 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 91.413032 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 91.413033 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 91.470394 s : r_e6.02111
Grid : Message : 91.476539 s : r_o6.02102
Grid : Message : 91.482442 s : res12.0421
Grid : Message : 92.138799 s : norm diff 0
Grid : Message : 92.545354 s : norm diff even 0
Grid : Message : 92.619444 s : norm diff odd 0

View File

@ -0,0 +1,29 @@
#!/bin/sh
#SBATCH --account=gm2dwf
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=0:30:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap --comms-concurrent"
srun -N 16 -n $SLURM_NTASKS \
./benchmarks/Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 > dwf.16node.perf

View File

@ -0,0 +1,39 @@
#!/bin/sh
#SBATCH --account=gm2dwf
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=2:00:00
#SBATCH --partition=develbooster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap --comms-concurrent"
srun -N 4 -n $SLURM_NTASKS \
./benchmarks/Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun -N 4 -n $SLURM_NTASKS \
./benchmarks/Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > comms.4node.perf

View File

@ -0,0 +1,5 @@
module load GCC/9.3.0
module load GMP/6.2.0
module load MPFR/4.1.0
module load OpenMPI/4.1.0rc1
module load CUDA/11.3