Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-04 05:54:32 +00:00)

Compare commits: fix/HOST_N ... 8a098889fc (35 commits)
Commits in this comparison (35):

8a098889fc, ff2ea5de18, da59379612, 3ef2a41518, aa96f420c6, 49e9e4ed0e,
f7b8163016, 93769eacd3, 59b0cc11df, f32c275376, 5404fc66ab, 1f53458af8,
434c3e7f1d, 500b119f3d, 4b87259c1b, 503dec34ef, d1e9fe50d2, d01e5fa838,
a477c25e8c, 1bd20cd9e8, e49e95b037, 6f59fed563, 60b7f6c99d, b92dfcc8d3,
f6fd6dd053, 79ad567dd5, fab1efb48c, 660eb76d93, 62e7bf024a, 95f3d69cf9,
89c0519f83, 2704b82084, cf8632bbac, d224297972, a4d11a630f
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #define _GRID_FFT_H_
 
 #ifdef HAVE_FFTW
-#ifdef USE_MKL
+#if defined(USE_MKL) || defined(GRID_SYCL)
 #include <fftw/fftw3.h>
 #else
 #include <fftw3.h>
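The hunk above widens the FFTW header selection so that a GRID_SYCL build also picks up MKL's FFTW-compatible interface. A minimal sketch of how the revised guard resolves (the macros are configure-time flags taken from the hunk context, not verified here):

// Sketch only: with -DGRID_SYCL and MKL on the include path, the MKL
// FFTW3-compatible header is selected; otherwise reference FFTW is used.
#ifdef HAVE_FFTW
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>   // MKL's FFTW3-compatible API
#else
#include <fftw3.h>        // reference FFTW
#endif
#endif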
@@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
   return offbytes;
 }
 
+#undef NVLINK_GET // Define to use get instead of put DMA
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(rrq);
       off_node_bytes+=rbytes;
     }
+#ifdef NVLINK_GET
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+#endif
   }
 
   if (dox) {
+    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(xrq);
       off_node_bytes+=xbytes;
     } else {
+#ifndef NVLINK_GET
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
       assert(shm!=NULL);
       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
     }
   }
 
@@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
 {
   int nreq=list.size();
+
+  acceleratorCopySynchronise();
 
   if (nreq==0) return;
 
   std::vector<MPI_Status> status(nreq);
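With NVLINK_GET defined, the intranode transfer is a get: the receiving rank pulls from the peer's send buffer (translated through ShmBufferTranslate) into its own receive buffer. Undefined (the default here), it is a put: the sender pushes into the peer's receive buffer. A toy host-side model of the two orientations, assuming peer_window() as a stand-in for ShmBufferTranslate and memcpy for acceleratorCopyDeviceToDeviceAsynch:

#include <cstring>
#include <cstddef>
// Toy model of put- vs get-oriented intranode DMA; not Grid's API.
static char shared_window[2][1024];                 // pretend per-rank windows
static void *peer_window(int peer) { return shared_window[peer]; }

// Put (NVLINK_GET undefined): sender pushes into peer's receive buffer.
void put_side(int dest, const void *xmit, std::size_t bytes) {
  std::memcpy(peer_window(dest), xmit, bytes);
}
// Get (NVLINK_GET defined): receiver pulls from peer's send buffer.
void get_side(int from, void *recv, std::size_t bytes) {
  std::memcpy(recv, peer_window(from), bytes);
}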
@@ -40,6 +40,9 @@ int                 GlobalSharedMemory::_ShmAlloc;
 uint64_t            GlobalSharedMemory::_ShmAllocBytes;
 
 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
+#ifndef ACCELERATOR_AWARE_MPI
+void * GlobalSharedMemory::HostCommBuf;
+#endif
 
 Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
 int                 GlobalSharedMemory::WorldShmRank;
@@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
+#ifndef ACCELERATOR_AWARE_MPI
+void *SharedMemory::HostBufferMalloc(size_t bytes){
+  void *ptr = (void *)host_heap_top;
+  host_heap_top  += bytes;
+  host_heap_bytes+= bytes;
+  if (host_heap_bytes >= host_heap_size) {
+    std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
+    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
+    assert(host_heap_bytes<host_heap_size);
+  }
+  return ptr;
+}
+void SharedMemory::HostBufferFreeAll(void) { 
+  host_heap_top  =(size_t)HostCommBuf;
+  host_heap_bytes=0;
+}
+#endif
 void *SharedMemory::ShmBufferMalloc(size_t bytes){
   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
   void *ptr = (void *)heap_top;
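HostBufferMalloc is a bump (slab) allocator: each allocation just advances a pointer through a pre-reserved host region, and freeing everything is a single pointer reset. That is why per-packet host staging buffers cost essentially nothing to recycle each halo exchange. A self-contained sketch of the same technique:

#include <cassert>
#include <cstddef>
#include <cstdlib>
// Minimal bump allocator mirroring HostBufferMalloc/HostBufferFreeAll.
struct SlabAllocator {
  char  *base, *top;
  size_t size;
  explicit SlabAllocator(size_t bytes)
    : base((char *)std::malloc(bytes)), top(base), size(bytes) {}
  void *alloc(size_t bytes) {            // O(1): advance the high-water mark
    assert(top + bytes <= base + size);
    void *p = top; top += bytes; return p;
  }
  void free_all() { top = base; }        // O(1): wipe every allocation at once
  ~SlabAllocator() { std::free(base); }
};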
@@ -75,7 +75,9 @@ public:
   static int           Hugepages;
 
   static std::vector<void *> WorldShmCommBufs;
+#ifndef ACCELERATOR_AWARE_MPI
+  static void *HostCommBuf;
+#endif
   static Grid_MPI_Comm WorldComm;
   static int           WorldRank;
   static int           WorldSize;
@@ -120,6 +122,13 @@ private:
   size_t heap_bytes;
   size_t heap_size;
 
+#ifndef ACCELERATOR_AWARE_MPI
+  size_t host_heap_top;  // set in free all
+  size_t host_heap_bytes;// set in free all
+  void *HostCommBuf;     // set in SetCommunicator
+  size_t host_heap_size; // set in SetCommunicator
+#endif
+
 protected:
 
   Grid_MPI_Comm    ShmComm; // for barriers
@@ -151,7 +160,10 @@ public:
   void *ShmBufferTranslate(int rank,void * local_p);
   void *ShmBufferMalloc(size_t bytes);
   void  ShmBufferFreeAll(void) ;
-
+#ifndef ACCELERATOR_AWARE_MPI
+  void *HostBufferMalloc(size_t bytes);
+  void HostBufferFreeAll(void);
+#endif
   //////////////////////////////////////////////////////////////////////////
   // Make info on Nodes & ranks and Shared memory available
   //////////////////////////////////////////////////////////////////////////
@@ -39,10 +39,12 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <hip/hip_runtime_api.h>
 #endif
 #ifdef GRID_SYCL
+#ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
-#include <syscall.h>
 #define SHM_SOCKETS
 #endif 
+#include <syscall.h>
+#endif
 
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -511,46 +513,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL)
-
-//if defined(GRID_SYCL)
-#if 0
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  void * ShmCommBuf ; 
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the pointer array for shared windows for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Each MPI rank should allocate our own buffer
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
-
-  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
-    exit(EXIT_FAILURE);  
-  }
-
-  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
-	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
-
-  SharedMemoryZero(ShmCommBuf,bytes);
-
-  assert(WorldShmSize == 1);
-  for(int r=0;r<WorldShmSize;r++){
-    WorldShmCommBufs[r] = ShmCommBuf;
-  }
-  _ShmAllocBytes=bytes;
-  _ShmAlloc=1;
-}
-#endif
-
 #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
@@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifndef ACCELERATOR_AWARE_MPI
+  HostCommBuf= malloc(bytes);
+#endif  
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   _ShmAllocBytes=bytes;
   _ShmAlloc=1;
 }
-#endif
 
 #else 
 #ifdef GRID_MPI3_SHMMMAP
@@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   }
   ShmBufferFreeAll();
 
+#ifndef ACCELERATOR_AWARE_MPI
+  host_heap_size = heap_size;
+  HostCommBuf= GlobalSharedMemory::HostCommBuf;
+  HostBufferFreeAll();
+#endif  
+
   /////////////////////////////////////////////////////////////////////
   // find comm ranks in our SHM group (i.e. which ranks are on our node)
   /////////////////////////////////////////////////////////////////////
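When MPI is not accelerator aware, SharedMemoryAllocate now also reserves a plain malloc'd host region of the same size as the device comms window, and SetCommunicator points the host slab allocator at it. A hedged sketch of the pairing, with devAlloc() standing in for acceleratorAllocDevice (plain malloc here, not Grid's API):

#include <cstdint>
#include <cstdlib>
// Sketch of the device-window / host-mirror pairing introduced above.
static void *devAlloc(std::uint64_t bytes) { return std::malloc(bytes); }

struct CommBuffers {
  void *device = nullptr;  // target of kernels and device-side copies
  void *host   = nullptr;  // handed to MPI when it cannot read device memory
  void allocate(std::uint64_t bytes) {
    host   = std::malloc(bytes);  // mirrors: HostCommBuf = malloc(bytes)
    device = devAlloc(bytes);     // mirrors: ShmCommBuf = acceleratorAllocDevice(bytes)
  }
};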
@@ -281,12 +281,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
   return nrm;
 }
 
+
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
 
+#ifdef GRID_SYCL
+  uint64_t csum=0;
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
+    // Hack
+    // Fast integer xor checksum. Can also be used in comms now.
+    autoView(l_v,left,AcceleratorRead);
+    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
+    uint64_t *base= (uint64_t *)&l_v[0];
+    csum=svm_xor(base,words);
+  }
+  FlightRecorder::CsumLog(csum);
+#endif
   ComplexD nrm = rankInnerProduct(left,right);
-  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
+  RealD local = real(nrm);
+  FlightRecorder::NormLog(real(nrm)); 
   grid->GlobalSum(nrm);
+  FlightRecorder::ReductionLog(local,real(nrm)); 
   return nrm;
 }
+
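The local norm is captured before GlobalSum and the global value after it, so a verify run can distinguish a rank-local reduction error from an error in the collective itself. A hedged sketch of that wrap-around-the-collective pattern, with global_sum() as a stand-in for grid->GlobalSum (trivial single-rank version here) and the FlightRecorder calls shown where the patch places them:

#include <complex>
// Log the rank-local partial result, do the collective, then log the
// (local, global) pair. Not Grid's code; a sketch of the pattern only.
static void global_sum(std::complex<double> &v) { (void)v; /* MPI_Allreduce in Grid */ }

std::complex<double> logged_inner_product(std::complex<double> rank_local) {
  double local = rank_local.real();   // captured BEFORE the collective
  // FlightRecorder::NormLog(local);
  global_sum(rank_local);
  // FlightRecorder::ReductionLog(local, rank_local.real());
  return rank_local;
}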
@@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
   return result;
 }
 
-NAMESPACE_END(Grid);
-
-/*
-template<class Double> Double svm_reduce(Double *vec,uint64_t L)
+template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
-  Double sumResult; zeroit(sumResult);
-  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
-  Double identity;  zeroit(identity);
+  Word xorResult; xorResult = 0;
+  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
+  Word identity;  identity=0;
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
      cgh.parallel_for(cl::sycl::range<1>{L},
 		      Reduction,
 		      [=] (cl::sycl::id<1> index, auto &sum) {
-	 sum +=vec[index];
+	 sum ^=vec[index];
      });
    });
   theGridAccelerator->wait();
-  Double ret = d_sum[0];
+  Word ret = d_sum[0];
   free(d_sum,*theGridAccelerator);
-  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
   return ret;
 }
+
+NAMESPACE_END(Grid);
+
+/*
+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
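svm_xor folds a buffer, reinterpreted as 64-bit words, into a single value with XOR. Because XOR is associative and commutative, the checksum is independent of reduction order, which makes it cheap and safe to run over comms buffers on the device. A host-side analogue using only the standard library:

#include <cstdint>
#include <functional>
#include <numeric>
// Host-side analogue of svm_xor: order-independent XOR checksum of a
// buffer viewed as 64-bit words; matches the device kernel's semantics.
std::uint64_t xor_checksum(const std::uint64_t *vec, std::uint64_t L) {
  return std::reduce(vec, vec + L, std::uint64_t(0),
                     std::bit_xor<std::uint64_t>());
}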
@@ -411,7 +411,7 @@ public:
       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
       SeedFixedIntegers(seeds);
     }
-  void SeedFixedIntegers(const std::vector<int> &seeds){
+  void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
 
     // Everyone generates the same seed_seq based on input seeds
     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@@ -428,7 +428,6 @@ public:
     // MT implementation does not implement fast discard even though
     // in principle this is possible
     ////////////////////////////////////////////////
-#if 1
     thread_for( lidx, _grid->lSites(), {
 
 	int gidx;
@@ -449,29 +448,12 @@ public:
 	
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
-	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
-    });
-#else
-    // Everybody loops over global volume.
-    thread_for( gidx, _grid->_gsites, {
-
-	// Where is it?
-	int rank;
-	int o_idx;
-	int i_idx;
-
-	Coordinate gcoor;
-	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
-	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-	
-	// If this is one of mine we take it
-	if( rank == _grid->ThisRank() ){
-	  int l_idx=generator_idx(o_idx,i_idx);
-	  _generators[l_idx] = master_engine;
+	if ( britney ) { 
+	  Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
+	} else { 	
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
     });
-#endif
 #else 
     ////////////////////////////////////////////////////////////////
     // Machine and thread decomposition dependent seeding is efficient
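The new britney flag selects the decorrelation stride that Skip applies to each site's generator: the default strides by the global site index gidx, which is independent of the MPI decomposition, while britney=1 strides by the local index l_idx. A usage sketch inside a Grid application (grid is an assumed GridCartesian pointer; the seed values are illustrative only):

// Illustrative call; SeedFixedIntegers signature is from the hunk above.
std::vector<int> seeds({1, 2, 3, 4});
GridParallelRNG pRNG(grid);          // one generator per local site
pRNG.SeedFixedIntegers(seeds);       // default: skip by global index gidx
pRNG.SeedFixedIntegers(seeds, 1);    // britney=1: skip by local index l_idx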
@@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     autoView(st_v , st,AcceleratorRead);
 
    if( interior && exterior ) {
+     acceleratorFenceComputeStream();
      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     autoView(st_v ,st,AcceleratorRead);
 
    if( interior && exterior ) {
+     acceleratorFenceComputeStream();
      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
 #ifndef GRID_CUDA
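acceleratorFenceComputeStream() orders the compute stream behind previously enqueued asynchronous copies before the Dhop kernels launch, without synchronizing the host. One common way such a fence is implemented on CUDA (a sketch under that assumption, not Grid's code; copyStream and computeStream are assumed stream handles):

#include <cuda_runtime.h>
// Sketch: kernels subsequently launched on computeStream wait for all
// work already recorded on copyStream; the host is never blocked.
void fence_compute_stream(cudaStream_t copyStream, cudaStream_t computeStream) {
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  cudaEventRecord(ev, copyStream);            // mark end of async copies
  cudaStreamWaitEvent(computeStream, ev, 0);  // device-side wait
  cudaEventDestroy(ev);                       // deferred until the wait resolves
}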
@@ -70,57 +70,6 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table);
 
-/*
-template<class vobj,class cobj,class compressor>
-void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline));
-
-template<class vobj,class cobj,class compressor>
-void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
-{
-  int num=table.size();
-  std::pair<int,int> *table_v = & table[0];
-
-  auto rhs_v = rhs.View(AcceleratorRead);
-  accelerator_forNB( i,num, vobj::Nsimd(), {
-    compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
-  });
-  rhs_v.ViewClose();
-}
-
-///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
-///////////////////////////////////////////////////////////////////
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-				 commVector<cobj *> pointers,
-				 int dimension,int plane,
-				 int cbmask,compressor &compress,int type) __attribute__((noinline));
-
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
-				 const Lattice<vobj> &rhs,
-				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
-				 compressor &compress,int type)
-{
-  assert( (table.size()&0x1)==0);
-  int num=table.size()/2;
-  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
-
-  auto rhs_v = rhs.View(AcceleratorRead);
-  auto rhs_p = &rhs_v[0];
-  auto p0=&pointers[0][0];
-  auto p1=&pointers[1][0];
-  auto tp=&table[0];
-  accelerator_forNB(j, num, vobj::Nsimd(), {
-      compress.CompressExchange(p0,p1, rhs_p, j,
-				so+tp[2*j  ].second,
-				so+tp[2*j+1].second,
-				type);
-  });
-  rhs_v.ViewClose();
-}
-*/
-
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@@ -258,6 +207,10 @@ public:
   struct Packet {
     void * send_buf;
     void * recv_buf;
+#ifndef ACCELERATOR_AWARE_MPI
+    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
+    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
+#endif
     Integer to_rank;
     Integer from_rank;
     Integer do_send;
@@ -324,7 +277,7 @@ public:
   Vector<int> surface_list;
 
   stencilVector<StencilEntry>  _entries; // Resident in managed memory
-  commVector<StencilEntry>     _entries_device; // Resident in managed memory
+  commVector<StencilEntry>     _entries_device; // Resident in device memory
   std::vector<Packet> Packets;
   std::vector<Merge> Mergers;
   std::vector<Merge> MergersSHM;
@@ -408,33 +361,16 @@ public:
   // Use OpenMP Tasks for cleaner ???
   // must be called *inside* parallel region
   //////////////////////////////////////////
-  /*
-  void CommunicateThreaded()
-  {
-#ifdef GRID_OMP
-    int mythread = omp_get_thread_num();
-    int nthreads = CartesianCommunicator::nCommThreads;
-#else
-    int mythread = 0;
-    int nthreads = 1;
-#endif
-    if (nthreads == -1) nthreads = 1;
-    if (mythread < nthreads) {
-      for (int i = mythread; i < Packets.size(); i += nthreads) {
-	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-						      Packets[i].to_rank,
-						      Packets[i].recv_buf,
-						      Packets[i].from_rank,
-						      Packets[i].bytes,i);
-      }
-    }
-  }
-  */
   ////////////////////////////////////////////////////////////////////////
   // Non blocking send and receive. Necessarily parallel.
   ////////////////////////////////////////////////////////////////////////
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
+    // All GPU kernel tasks must complete
+    //    accelerator_barrier();     // All kernels should ALREADY be complete
+    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
+                               // But the HaloGather had a barrier too.
+#ifdef ACCELERATOR_AWARE_MPI
     for(int i=0;i<Packets.size();i++){
       _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@@ -443,16 +379,54 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
     }
+#else
+#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
+    for(int i=0;i<Packets.size();i++){
+      // Introduce a host buffer with a cheap slab allocator and zero cost wipe all
+      Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
+      Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
+      if ( Packets[i].do_send ) {
+	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
+      }
+      _grid->StencilSendToRecvFromBegin(MpiReqs,
+					Packets[i].host_send_buf,
+					Packets[i].to_rank,Packets[i].do_send,
+					Packets[i].host_recv_buf,
+					Packets[i].from_rank,Packets[i].do_recv,
+					Packets[i].xbytes,Packets[i].rbytes,i);
+    }
+#endif
+    // Get comms started then run checksums
+    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
+    for(int i=0;i<Packets.size();i++){
+      if ( Packets[i].do_send )
+	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
+    }
   }
 
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
-    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
+    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
     if   ( this->partialDirichlet ) DslashLogPartial();
     else if ( this->fullDirichlet ) DslashLogDirichlet();
     else DslashLogFull();
-    acceleratorCopySynchronise();
+    // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
+    //    accelerator_barrier(); 
     _grid->StencilBarrier(); 
+#ifndef ACCELERATOR_AWARE_MPI
+#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
+    for(int i=0;i<Packets.size();i++){
+      if ( Packets[i].do_recv ) {
+	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
+      }
+    }
+    _grid->HostBufferFreeAll();
+#endif
+    // run any checksums
+    for(int i=0;i<Packets.size();i++){
+      if ( Packets[i].do_recv )
+	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
+    }
   }
   ////////////////////////////////////////////////////////////////////////
   // Blocking send and receive. Either sequential or parallel.
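This is the heart of the fix/HOST_N branch: when MPI is not accelerator aware, every halo packet is staged through host memory on both sides of the wire. A self-contained sketch of the same round trip for one packet (the MPI calls are real API; memcpy stands in for acceleratorCopyFromDevice/acceleratorCopyToDevice, and Grid uses its slab allocator rather than malloc):

#include <mpi.h>
#include <cstddef>
#include <cstdlib>
#include <cstring>
// Host-staged non-blocking exchange, mirroring CommunicateBegin/Complete.
void exchange_one_packet(void *dev_send, void *dev_recv, std::size_t bytes,
                         int to, int from, MPI_Comm comm) {
  void *host_send = std::malloc(bytes);
  void *host_recv = std::malloc(bytes);
  MPI_Request reqs[2];
  std::memcpy(host_send, dev_send, bytes);          // device -> host stage
  MPI_Irecv(host_recv, (int)bytes, MPI_CHAR, from, 0, comm, &reqs[0]);
  MPI_Isend(host_send, (int)bytes, MPI_CHAR, to,   0, comm, &reqs[1]);
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);        // MPI done on host buffers
  std::memcpy(dev_recv, host_recv, bytes);          // host -> device unstage
  std::free(host_send); std::free(host_recv);
}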
@@ -528,6 +502,7 @@ public:
   template<class compressor>
   void HaloGather(const Lattice<vobj> &source,compressor &compress)
   {
+    //    accelerator_barrier();
     _grid->StencilBarrier();// Synch shared memory on a single nodes
 
     assert(source.Grid()==_grid);
@@ -540,10 +515,9 @@ public:
       compress.Point(point);
       HaloGatherDir(source,compress,point,face_idx);
     }
-    accelerator_barrier();
+    accelerator_barrier(); // All my local gathers are complete
     face_table_computed=1;
     assert(u_comm_offset==_unified_buffer_size);
-
   }
 
   /////////////////////////
@@ -579,6 +553,7 @@ public:
       accelerator_forNB(j, words, cobj::Nsimd(), {
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
       });
+      acceleratorFenceComputeStream();
     }
   }
   
@@ -669,6 +644,7 @@ public:
     for(int i=0;i<dd.size();i++){
       decompressor::DecompressFace(decompress,dd[i]);
     }
+    acceleratorFenceComputeStream(); // dependent kernels
   }
   ////////////////////////////////////////
   // Set up routines
@@ -1224,7 +1200,6 @@ public:
 	  ///////////////////////////////////////////////////////////
 	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
-	  
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, do_send,
@@ -405,11 +405,4 @@ NAMESPACE_BEGIN(Grid);
 NAMESPACE_END(Grid);
 
 
-#ifdef GRID_SYCL
-template<typename T> struct
-sycl::is_device_copyable<T, typename std::enable_if<
-			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
-			      void>::type>
-  : public std::true_type {};
-#endif
 

Grid/util/FlightRecorder.cc (new file, 341 lines)
@@ -0,0 +1,341 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/Init.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@MacBook-Pro.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+///////////////////////////////////////////////////////
+// Grid Norm logging for repro testing
+///////////////////////////////////////////////////////
+int FlightRecorder::PrintEntireLog;
+int FlightRecorder::ContinueOnFail;
+int FlightRecorder::LoggingMode;
+int FlightRecorder::ChecksumComms;
+int FlightRecorder::ChecksumCommsSend;
+int32_t  FlightRecorder::XmitLoggingCounter;
+int32_t  FlightRecorder::RecvLoggingCounter;
+int32_t  FlightRecorder::CsumLoggingCounter;
+int32_t  FlightRecorder::NormLoggingCounter;
+int32_t  FlightRecorder::ReductionLoggingCounter;
+uint64_t FlightRecorder::ErrorCounter;
+std::vector<double> FlightRecorder::NormLogVector;
+std::vector<double> FlightRecorder::ReductionLogVector;
+std::vector<uint64_t> FlightRecorder::CsumLogVector;
+std::vector<uint64_t> FlightRecorder::XmitLogVector;
+std::vector<uint64_t> FlightRecorder::RecvLogVector;
+
+void FlightRecorder::ResetCounters(void)
+{
+  XmitLoggingCounter=0;
+  RecvLoggingCounter=0;
+  CsumLoggingCounter=0;
+  NormLoggingCounter=0;
+  ReductionLoggingCounter=0;
+}
+void FlightRecorder::Truncate(void)
+{
+  ResetCounters();
+  XmitLogVector.resize(0);
+  RecvLogVector.resize(0);
+  NormLogVector.resize(0);
+  CsumLogVector.resize(0);
+  ReductionLogVector.resize(0);
+}
+void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
+{
+  switch ( mode ) {
+  case LoggingModePrint:
+    SetLoggingModePrint();
+    break;
+  case LoggingModeRecord:
+    SetLoggingModeRecord();
+    break;
+  case LoggingModeVerify:
+    SetLoggingModeVerify();
+    break;
+  case LoggingModeNone:
+    LoggingMode = mode;
+    Truncate();
+    break;
+  default:
+    assert(0);
+  }
+}
+
+void FlightRecorder::SetLoggingModePrint(void)
+{
+  std::cout << " FlightRecorder: set to print output " <<std::endl;
+  Truncate();
+  LoggingMode = LoggingModePrint;
+}
+void FlightRecorder::SetLoggingModeRecord(void)
+{
+  std::cout << " FlightRecorder: set to RECORD " <<std::endl;
+  Truncate();
+  LoggingMode = LoggingModeRecord;
+}
+void FlightRecorder::SetLoggingModeVerify(void)
+{
+  std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl;
+  ResetCounters();
+  LoggingMode = LoggingModeVerify;
+}
+uint64_t FlightRecorder::ErrorCount(void)
+{
+  return ErrorCounter;
+}
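FlightRecorder is a record/verify harness for reproducibility testing: a first run appends every norm, checksum, and reduction to in-memory logs, and a second, supposedly bit-identical run replays against them, with ErrorCount() reporting mismatches. A hedged usage sketch inside a Grid application (run_solver is a placeholder for any deterministic workload, not part of this file):

#include <iostream>
// Hypothetical driver for the record/verify cycle defined in this file.
void run_solver();   // stand-in for a deterministic Grid workload

void repro_test() {
  FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
  run_solver();      // norms/csums/reductions appended to the log vectors
  FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
  run_solver();      // identical values asserted against the recorded log
  std::cout << FlightRecorder::ErrorCount() << " mismatches" << std::endl;
}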
 | 
					void FlightRecorder::NormLog(double value)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  uint64_t hex = * ( (uint64_t *)&value );
 | 
				
			||||||
 | 
					  if(LoggingMode == LoggingModePrint) {
 | 
				
			||||||
 | 
					    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
				
			||||||
 | 
					    NormLoggingCounter++;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if(LoggingMode == LoggingModeRecord) {
 | 
				
			||||||
 | 
					    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
				
			||||||
 | 
					    NormLogVector.push_back(value);
 | 
				
			||||||
 | 
					    NormLoggingCounter++;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if(LoggingMode == LoggingModeVerify) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if(NormLoggingCounter < NormLogVector.size()){
 | 
				
			||||||
 | 
					      uint64_t hexref  = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
 | 
				
			||||||
 | 
							 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
 | 
				
			||||||
 | 
							 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
 | 
				
			||||||
 | 
							GridHostname(),
 | 
				
			||||||
 | 
							GlobalSharedMemory::WorldShmRank,
 | 
				
			||||||
 | 
							NormLoggingCounter,NormLogVector.size(),
 | 
				
			||||||
 | 
							value, NormLogVector[NormLoggingCounter]); fflush(stderr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if(!ContinueOnFail)assert(0); // Force takedown of job
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						ErrorCounter++;
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
						if ( PrintEntireLog ) { 
 | 
				
			||||||
 | 
						  std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
 | 
				
			||||||
 | 
							   <<" "<<hex<<" "<<hexref
 | 
				
			||||||
 | 
							   <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					       
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if ( NormLogVector.size()==NormLoggingCounter ) {
 | 
				
			||||||
 | 
					      std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    NormLoggingCounter++;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
void FlightRecorder::CsumLog(uint64_t hex)
{
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLoggingCounter++;
  }
  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLogVector.push_back(hex);
    CsumLoggingCounter++;
  }
  if(LoggingMode == LoggingModeVerify) {
    if(CsumLoggingCounter < CsumLogVector.size()) {
      uint64_t hexref  = CsumLogVector[CsumLoggingCounter] ;
      if ( hex != hexref ) {
        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
                 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
        fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n",
                GridHostname(),
                GlobalSharedMemory::WorldShmRank,
                CsumLoggingCounter,hex, hexref);
        fflush(stderr);
        if(!ContinueOnFail) assert(0); // Force takedown of job
        ErrorCounter++;
      } else {
        if ( PrintEntireLog ) {
          std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
                   <<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
        }
      }
    }
    if ( CsumLogVector.size()==CsumLoggingCounter ) {
      std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
    }
    CsumLoggingCounter++;
  }
}
void FlightRecorder::ReductionLog(double local,double global)
{
  uint64_t hex_l = * ( (uint64_t *)&local );
  uint64_t hex_g = * ( (uint64_t *)&global );
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
    ReductionLoggingCounter++;
  }
  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
    ReductionLogVector.push_back(global);
    ReductionLoggingCounter++;
  }
  if(LoggingMode == LoggingModeVerify) {
    if(ReductionLoggingCounter < ReductionLogVector.size()){
      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
        fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
                GridHostname(),
                GlobalSharedMemory::WorldShmRank,
                ReductionLoggingCounter,ReductionLogVector.size(),
                global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
        if ( !ContinueOnFail ) assert(0);
        ErrorCounter++;
      } else {
        if ( PrintEntireLog ) {
          std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
        }
      }
    }
    if ( ReductionLogVector.size()==ReductionLoggingCounter ) {
      std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
    }
    ReductionLoggingCounter++;
  }
}
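ReductionLog compares doubles for exact equality and prints their raw bit patterns, because a reproducibility check needs bit-for-bit agreement, not approximate agreement. A self-contained illustration of the bit view (memcpy is the aliasing-safe spelling of the pointer cast used above; the values are arbitrary examples):

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint64_t bits_of(double x) {    // bit-exact view of an IEEE-754 double
  uint64_t u;
  std::memcpy(&u, &x, sizeof u);
  return u;
}

int main() {
  double a = 0.1 + 0.2, b = 0.3;       // differ by one ulp, though they print alike at low precision
  std::printf("%.17g -> %016llx\n", a, (unsigned long long)bits_of(a));
  std::printf("%.17g -> %016llx\n", b, (unsigned long long)bits_of(b));
}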
void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
{
  if ( ChecksumCommsSend ){
    uint64_t *ubuf = (uint64_t *)buf;
    if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
    uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
    if(LoggingMode == LoggingModePrint) {
      std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
      XmitLoggingCounter++;
    }
    if(LoggingMode == LoggingModeRecord) {
      std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
      XmitLogVector.push_back(_xor);
      XmitLoggingCounter++;
    }
    if(LoggingMode == LoggingModeVerify) {
      if(XmitLoggingCounter < XmitLogVector.size()){
        if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
          fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n",
                  GridHostname(),
                  GlobalSharedMemory::WorldShmRank,
                  XmitLoggingCounter,XmitLogVector.size(),
                  _xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
          if ( !ContinueOnFail ) assert(0);
          ErrorCounter++;
        } else {
          if ( PrintEntireLog ) {
            std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  XmitLogVector[XmitLoggingCounter] <<std::endl;
          }
        }
      }
      if ( XmitLogVector.size()==XmitLoggingCounter ) {
        std::cout << "FlightRecorder::xmitLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
      }
      XmitLoggingCounter++;
    }
#endif
  } else {
    uint64_t word = 1;
    deviceVector<uint64_t> dev(1);
    acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
    acceleratorCopySynchronise();
#ifndef GRID_COMMS_NONE
    MPI_Barrier(MPI_COMM_WORLD);
#endif
  }
}
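xmitLog above and recvLog below reduce each communication buffer to a single 64-bit word with svm_xor, a SYCL device-side reduction. A host-side sketch of the same fold (an assumption about svm_xor's semantics, shown only to make the intent concrete):

#include <cstddef>
#include <cstdint>

uint64_t xor_fold(const uint64_t *buf, size_t nwords) {
  uint64_t x = 0;
  for (size_t i = 0; i < nwords; i++) x ^= buf[i];  // order-independent fold
  return x;  // identical buffers yield identical words, run after run
}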
void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
{
  if ( ChecksumComms ){
    uint64_t *ubuf = (uint64_t *)buf;
    if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
    uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
    if(LoggingMode == LoggingModePrint) {
      std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
      RecvLoggingCounter++;
    }
    if(LoggingMode == LoggingModeRecord) {
      std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
      RecvLogVector.push_back(_xor);
      RecvLoggingCounter++;
    }
    if(LoggingMode == LoggingModeVerify) {
      if(RecvLoggingCounter < RecvLogVector.size()){
        if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
          fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n",
                  GridHostname(),
                  GlobalSharedMemory::WorldShmRank,
                  RecvLoggingCounter,RecvLogVector.size(),
                  _xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
          if ( !ContinueOnFail ) assert(0);
          ErrorCounter++;
        } else {
          if ( PrintEntireLog ) {
            std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  RecvLogVector[RecvLoggingCounter] <<std::endl;
          }
        }
      }
      if ( RecvLogVector.size()==RecvLoggingCounter ) {
        std::cout << "FlightRecorder::recvLog : Verified entire sequence of "<<RecvLoggingCounter<<" receives "<<std::endl;
      }
      RecvLoggingCounter++;
    }
#endif
  }
}

NAMESPACE_END(Grid);
Grid/util/FlightRecorder.h (new file, 43 lines)
@@ -0,0 +1,43 @@
#pragma once

NAMESPACE_BEGIN(Grid);
class FlightRecorder {
 public:
  enum LoggingMode_t {
    LoggingModeNone,
    LoggingModePrint,
    LoggingModeRecord,
    LoggingModeVerify
  };

  static int                   LoggingMode;
  static uint64_t              ErrorCounter;
  static int32_t               XmitLoggingCounter;
  static int32_t               RecvLoggingCounter;
  static int32_t               CsumLoggingCounter;
  static int32_t               NormLoggingCounter;
  static int32_t               ReductionLoggingCounter;
  static std::vector<uint64_t> XmitLogVector;
  static std::vector<uint64_t> RecvLogVector;
  static std::vector<uint64_t> CsumLogVector;
  static std::vector<double>   NormLogVector;
  static std::vector<double>   ReductionLogVector;
  static int ContinueOnFail;
  static int PrintEntireLog;
  static int ChecksumComms;
  static int ChecksumCommsSend;
  static void SetLoggingModePrint(void);
  static void SetLoggingModeRecord(void);
  static void SetLoggingModeVerify(void);
  static void SetLoggingMode(LoggingMode_t mode);
  static void NormLog(double value);
  static void CsumLog(uint64_t csum);
  static void ReductionLog(double lcl, double glbl);
  static void Truncate(void);
  static void ResetCounters(void);
  static uint64_t ErrorCount(void);
  static void xmitLog(void *,uint64_t bytes);
  static void recvLog(void *,uint64_t bytes,int rank);
};
NAMESPACE_END(Grid);
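A minimal usage sketch of the API declared above, mirroring the Test_dwf_mixedcg_prec changes later in this diff (the solve itself is elided; only the FlightRecorder calls are taken from this change set):

#include <cassert>
#include <Grid/Grid.h>   // assumption: pulls in Grid/util/FlightRecorder.h via Grid_util.h

void repro_loop(int npasses) {
  using Grid::FlightRecorder;
  FlightRecorder::ContinueOnFail = 0;  // assert the job down on the first mismatch
  for (int iter = 0; iter < npasses; iter++) {
    if (iter == 0) FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
    else           FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
    // ... run the identical solve here; norms and checksums are logged from inside Grid ...
    assert(FlightRecorder::ErrorCount() == 0);
  }
}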
@@ -94,7 +94,12 @@ int GridThread::_threads =1;
 int GridThread::_hyperthreads=1;
 int GridThread::_cores=1;
+
+char hostname[HOST_NAME_MAX+1];
+
+char *GridHostname(void)
+{
+  return hostname;
+}
 const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;};
 const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;};
 const Coordinate GridDefaultSimd(int dims,int nsimd)
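The point of the file-scope buffer and accessor added here is that the hostname, filled once at startup, stays reachable from the FlightRecorder error paths above. A trivial self-contained version of the same shape (MyHostname is a hypothetical stand-in for GridHostname):

#include <unistd.h>
#include <cstdio>

static char hostname[256];             // stand-in for HOST_NAME_MAX+1

const char *MyHostname(void) { return hostname; }

int main() {
  gethostname(hostname, sizeof hostname);   // fill once, as Grid_init() does below
  std::printf("reporting from %s\n", MyHostname());
}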
@@ -397,7 +402,6 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
   std::cout << GridLogMessage << "================================================ "<<std::endl;
 
-  char hostname[HOST_NAME_MAX+1];
   gethostname(hostname, HOST_NAME_MAX+1);
   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
 
@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 void Grid_init(int *argc,char ***argv);
 void Grid_finalize(void);
+
+char * GridHostname(void);
 
 // internal, controled with --handle
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
 void Grid_debug_handler_init(void);
@@ -68,5 +70,6 @@ void GridParseLayout(char **argv,int argc,
 void printHash(void);
 
+
 
 NAMESPACE_END(Grid);
 
@@ -1,6 +1,6 @@
-#ifndef GRID_UTIL_H
-#define GRID_UTIL_H
+#pragma once
 #include <Grid/util/Coordinate.h>
 #include <Grid/util/Lexicographic.h>
 #include <Grid/util/Init.h>
-#endif
+#include <Grid/util/FlightRecorder.h>
configure.ac (21 lines changed)
@@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in
 esac
 
 ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
-AC_ARG_ENABLE([accelerator-cshift],
-    [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
-    [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
+AC_ARG_ENABLE([accelerator-aware-mpi],
+    [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
+    [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
 
-AC_ARG_ENABLE([ucx-buggy],
-    [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
-    [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
-
-case ${ac_UCXBUGGY} in
+case ${ac_ACCELERATOR_AWARE_MPI} in
     yes)
-    ac_ACC_CSHIFT=no;;
-    *);;
-esac
-
-case ${ac_ACC_CSHIFT} in
-    yes)
-      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
+      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
+      AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
     *);;
 esac
 
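The net effect of the new switch is a pair of preprocessor defines; code that hands buffers to MPI can branch on the second one roughly as follows (an illustrative gate, not the actual communicator source):

#ifdef ACCELERATOR_AWARE_MPI
  // MPI_Isend/MPI_Irecv may be handed device pointers directly
#else
  // transfers are staged through host bounce buffers before the MPI calls
#endif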
@@ -1,16 +1,16 @@
-TOOLS=$HOME/tools
+
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
-	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
+	--enable-accelerator-aware-mpi=no\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
systems/Aurora/sourceme-sunspot-deterministic.sh (new file, 2 lines)
@@ -0,0 +1,2 @@
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
systems/Aurora/tests/repro128.pbs (new file, 41 lines)
@@ -0,0 +1,41 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=128
#PBS -l walltime=02:00:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

cat $PBS_NODEFILE

export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 16 nodes, 192 ranks
# 12 ppn, 128 nodes, 1536 ranks
CMD="mpiexec -np 1536 -ppn 12  -envall \
	     ./gpu_tile_compact.sh \
	     ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \
		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap "
$CMD 
@@ -2,26 +2,39 @@
 
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 
-#PBS -q EarlyAppAccess
-#PBS -l select=16
-#PBS -l walltime=01:00:00
+#PBS -l select=16:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=01:00:00
+#PBS -N dwf
+#PBS -k doe
 
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 
 cd $PBS_O_WORKDIR
 
-source ../sourceme.sh
+#source ../sourceme.sh
 
 cat $PBS_NODEFILE
 
+#export MPICH_COLL_SYNC=1
+#export MPICH_ENV_DISPLAY=1
+export MPICH_
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH
 
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
@@ -32,9 +45,17 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 
-# 12 ppn, 16 nodes, 192 ranks
+DIR=repro.$PBS_JOBID
+mkdir $DIR
+cd $DIR
+
 CMD="mpiexec -np 192 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
-		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+	     ../gpu_tile_compact.sh \
+	     ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative"
+#--comms-overlap
 $CMD 
+
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm core.*
systems/Aurora/tests/repro1gpu.pbs (new file, 82 lines)
@@ -0,0 +1,82 @@
#!/bin/bash

#PBS -l select=16:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N repro1gpu
#PBS -k doe

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

cd $PBS_O_WORKDIR

NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE

echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE


for g in {0..11}
do
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )

export numa=${NUMA_MAP[$g]}
export gpu_id=${GPU_MAP[$g]}
export tile_id=${TILE_MAP[$g]}
export gpu=$gpu_id.$tile_id

cd $PBS_O_WORKDIR

DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
mkdir -p $DIR
cd $DIR

echo $THIS_NODE > nodefile
echo $gpu > gpu

export ZE_AFFINITY_MASK=$gpu
export ONEAPI_DEVICE_FILTER=gpu,level_zero

CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \
	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
echo $CMD
$CMD &

done
done

wait
systems/Aurora/tests/reproN.pbs (new file, 98 lines)
@@ -0,0 +1,98 @@
#!/bin/bash

#PBS -l select=32:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N reproN
#PBS -k doe

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1

export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=0
export GRID_CHECKSUM_SEND_BUF=0

export MPICH_OFI_NIC_POLICY=GPU

export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE

cd $PBS_O_WORKDIR

NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE

echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do

cd $PBS_O_WORKDIR

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE

DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

mkdir -p $DIR
cd $DIR

echo $THIS_NODE > nodefile

#CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
#	     ../../gpu_tile_compact.sh \
#	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
#		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
	     ../../gpu_tile_compact.sh \
	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

echo $CMD > command-line
env > environment
$CMD &

done

# Suspicious wait is allowing jobs to collide and knock out
#wait

sleep 6500

for n in ` eval echo {1..$NN} `
do

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

cd $DIR

grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*

done
@@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU
 CMD="mpiexec -np 192 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
-	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap"
 $CMD 
@@ -1,4 +1,4 @@
-TOOLS=$HOME/tools
+
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
systems/Sunspot/sourceme.sh (new file, 2 lines)
@@ -0,0 +1,2 @@
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
systems/Sunspot/tests/repro1gpu.pbs (new file, 81 lines)
@@ -0,0 +1,81 @@
#!/bin/bash

#PBS -l select=16:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N repro1gpu
#PBS -k doe

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE

cd $PBS_O_WORKDIR

NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE

echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE


for g in {0..11}
do
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )

export numa=${NUMA_MAP[$g]}
export gpu_id=${GPU_MAP[$g]}
export tile_id=${TILE_MAP[$g]}
export gpu=$gpu_id.$tile_id

cd $PBS_O_WORKDIR

DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
mkdir -p $DIR
cd $DIR

echo $THIS_NODE > nodefile
echo $gpu > gpu

export ZE_AFFINITY_MASK=$gpu
export ONEAPI_DEVICE_FILTER=gpu,level_zero

CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \
	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
echo $CMD
$CMD &

done
done

wait
systems/Sunspot/tests/reproN.pbs (new file, 97 lines)
@@ -0,0 +1,97 @@
#!/bin/bash

#PBS -l select=32:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N reproN
#PBS -k doe

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1

export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1

export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1
export GRID_CHECKSUM_SEND_BUF=0

export MPICH_OFI_NIC_POLICY=GPU

export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE

cd $PBS_O_WORKDIR

NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE

echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do

cd $PBS_O_WORKDIR

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE

DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

mkdir -p $DIR
cd $DIR

echo $THIS_NODE > nodefile

#CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
#	     ../../gpu_tile_compact.sh \
#	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
#		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
	     ../../gpu_tile_compact.sh \
	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

echo $CMD > command-line
env > environment
$CMD &

done

# Suspicious wait is allowing jobs to collide and knock out
#wait

sleep 6500

for n in ` eval echo {1..$NN} `
do

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

cd $DIR

grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*

done
@@ -34,6 +34,46 @@ using namespace Grid;
 #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 #endif
 
+
+NAMESPACE_BEGIN(Grid);
+template<class Matrix,class Field>
+  class SchurDiagMooeeOperatorParanoid :  public SchurOperatorBase<Field> {
+ public:
+    Matrix &_Mat;
+    SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){};
+    virtual  void Mpc      (const Field &in, Field &out) {
+      Field tmp(in.Grid());
+      tmp.Checkerboard() = !in.Checkerboard();
+      //      std::cout <<" Mpc starting"<<std::endl;
+
+      RealD nn = norm2(in); // std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl;
+      _Mat.Meooe(in,tmp);
+      nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooeinv "<<nn<<std::endl;
+      _Mat.MooeeInv(tmp,out);
+      nn = norm2(out); //std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl;
+      _Mat.Meooe(out,tmp);
+      nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooee "<<nn<<std::endl;
+      _Mat.Mooee(in,out);
+      nn = norm2(out); //std::cout <<" Mpc Prior to axpy "<<nn<<std::endl;
+      axpy(out,-1.0,tmp,out);
+    }
+    virtual void MpcDag   (const Field &in, Field &out){
+      Field tmp(in.Grid());
+      //      std::cout <<" MpcDag starting"<<std::endl;
+      RealD nn = norm2(in);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl;
+      _Mat.MeooeDag(in,tmp);
+      _Mat.MooeeInvDag(tmp,out);
+      nn = norm2(out);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl;
+      _Mat.MeooeDag(out,tmp);
+      nn = norm2(tmp);// std::cout <<" MpcDag Prior to Mooee "<<nn<<std::endl;
+      _Mat.MooeeDag(in,out);
+      nn = norm2(out);// std::cout <<" MpcDag Prior to axpy "<<nn<<std::endl;
+      axpy(out,-1.0,tmp,out);
+    }
+};
+
+NAMESPACE_END(Grid);
+
 int main (int argc, char ** argv)
 {
   char hostname[HOST_NAME_MAX+1];
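The Paranoid variant computes the usual even-odd preconditioned operator and differs from SchurDiagMooeeOperator only in the norm2 probes between the steps, each of which feeds the FlightRecorder through Grid's instrumented global sums. In conventional notation, Mpc applied to an odd-checkerboard field is

\hat{M}_{\rm pc}\,\psi_o \;=\; M_{oo}\,\psi_o \;-\; M_{oe}\,M_{ee}^{-1}\,M_{eo}\,\psi_o

so the mixed-precision CG below can use it as a drop-in replacement for the stock operator.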
@@ -82,8 +122,8 @@ int main (int argc, char ** argv)
   result_o_2.Checkerboard() = Odd;
   result_o_2 = Zero();
 
-  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
-  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
   int nsecs=600;
   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
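Note: the Paranoid variant is a drop-in replacement here because it implements the same SchurOperatorBase<Field> interface as the operator it replaces, so the solver call sites compile unchanged. A minimal sketch of that unchanged usage, assuming the test's field naming (result_o appears in this excerpt; src_o is assumed):

    SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
    CG(HermOpEO, src_o, result_o);   // same call signature as before the substitution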
@@ -104,10 +144,22 @@ int main (int argc, char ** argv)
 
   time_t start = time(NULL);
 
-  uint32_t csum, csumref;
-  csumref=0;
+  FlightRecorder::ContinueOnFail = 0;
+  FlightRecorder::PrintEntireLog = 0;
+  FlightRecorder::ChecksumComms  = 1;
+  FlightRecorder::ChecksumCommsSend=0;
+
+  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms      = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend  = atoi(s);
+
   int iter=0;
   do {
+    if ( iter == 0 ) {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    } else {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
+    }
     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
     result_o = Zero();
     t1=usecond();
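Note: this hunk swaps the old hand-rolled checksum bookkeeping (csum/csumref) for the FlightRecorder. The first pass through the loop records every logged value; every later pass verifies against that recording; the three getenv switches make recv-buffer checksumming, send-buffer checksumming, and full-log printing configurable per job. A condensed sketch of the record-then-verify pattern, using only the FlightRecorder calls visible in this diff (doOneSolve and deadline are hypothetical stand-ins for the solve body and time budget):

    for (int iter = 0; time(NULL) < deadline; iter++) {
      if (iter == 0) {
        // first pass: record reference values for every logged reduction/comm
        FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
      } else {
        // later passes: compare each logged value against the recording
        FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
      }
      doOneSolve();                               // hypothetical: one preconditioned CG solve
      assert(FlightRecorder::ErrorCount() == 0);  // any mismatch is counted, and fatal here
    }

The ErrorCount() assertion itself lands in the next hunk.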
@@ -118,27 +170,23 @@ int main (int argc, char ** argv)
     flops+= CGsiteflops*FrbGrid->gSites()*iters;
     std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
     std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
+    std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
 
-    csum = crc(result_o);
+    assert(FlightRecorder::ErrorCount()==0);
 
-    if ( csumref == 0 ) {
-      csumref = csum;
-    } else {
-      if ( csum != csumref ) { 
-	std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
-	assert(0);
-      } else {
-	std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
-      }
-    }
+    std::cout << " FlightRecorder is OK! "<<std::endl;
     iter ++;
-  } while (time(NULL) < (start + nsecs/2) );
+  } while (time(NULL) < (start + nsecs/10) );
     
   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  csumref=0;
   int i=0;
   do { 
+    if ( i == 0 ) {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    } else {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
+    }
     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
     result_o_2 = Zero();
     t1=usecond();
@@ -150,19 +198,9 @@ int main (int argc, char ** argv)
 
     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-    csum = crc(result_o);
-
-    if ( csumref == 0 ) {
-      csumref = csum;
-    } else {
-      if ( csum != csumref ) { 
-	std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
-	assert(0);
-      } else {
-	std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
-      }
-    }
+    std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
+    assert(FlightRecorder::ErrorCount()==0);
+    std::cout << " FlightRecorder is OK! "<<std::endl;
     i++;
   } while (time(NULL) < (start + nsecs) );
 
- 
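Note: the two hunks above also rebalance the time budget: the single-precision loop now exits at start + nsecs/10 (it was nsecs/2), leaving roughly nine tenths of the --seconds budget to the double-precision verify loop. Schematically (a sketch of the control flow, not the literal test code):

    time_t start = time(NULL);
    do { /* single-precision solve + FlightRecorder checks */ }
    while (time(NULL) < (start + nsecs/10));
    do { /* double-precision solve + FlightRecorder checks */ }
    while (time(NULL) < (start + nsecs));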