mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-31 12:04:33 +00:00 
			
		
		
		
	Compare commits
	
		
			23 Commits
		
	
	
		
			6d7219b59d
			...
			feature/gp
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | da59379612 | ||
|  | 3ef2a41518 | ||
|  | aa96f420c6 | ||
|  | 49e9e4ed0e | ||
|  | f7b8163016 | ||
|  | 93769eacd3 | ||
|  | 59b0cc11df | ||
|  | f32c275376 | ||
|  | 5404fc66ab | ||
|  | 1f53458af8 | ||
|  | 434c3e7f1d | ||
|  | 500b119f3d | ||
|  | 4b87259c1b | ||
|  | 503dec34ef | ||
|  | d1e9fe50d2 | ||
|  | d01e5fa838 | ||
|  | a477c25e8c | ||
|  | 1bd20cd9e8 | ||
|  | e49e95b037 | ||
|  | 6f59fed563 | ||
|  | 60b7f6c99d | ||
|  | b92dfcc8d3 | ||
|  | f6fd6dd053 | 
| @@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | |||||||
|   return offbytes; |   return offbytes; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #undef NVLINK_GET // Define to use get instead of put DMA | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int dest,int dox, | 							 int dest,int dox, | ||||||
| @@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|       list.push_back(rrq); |       list.push_back(rrq); | ||||||
|       off_node_bytes+=rbytes; |       off_node_bytes+=rbytes; | ||||||
|     } |     } | ||||||
|  | #ifdef NVLINK_GET | ||||||
|  |       void *shm = (void *) this->ShmBufferTranslate(from,xmit); | ||||||
|  |       assert(shm!=NULL); | ||||||
|  |       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); | ||||||
|  | #endif | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   if (dox) { |   if (dox) { | ||||||
|  |     //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes); | ||||||
|     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { |     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { | ||||||
|       tag= dir+_processor*32; |       tag= dir+_processor*32; | ||||||
|       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); |       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); | ||||||
| @@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|       list.push_back(xrq); |       list.push_back(xrq); | ||||||
|       off_node_bytes+=xbytes; |       off_node_bytes+=xbytes; | ||||||
|     } else { |     } else { | ||||||
|  | #ifndef NVLINK_GET | ||||||
|       void *shm = (void *) this->ShmBufferTranslate(dest,recv); |       void *shm = (void *) this->ShmBufferTranslate(dest,recv); | ||||||
|       assert(shm!=NULL); |       assert(shm!=NULL); | ||||||
|       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); |       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); | ||||||
|  | #endif | ||||||
|  |        | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque | |||||||
| { | { | ||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|  |  | ||||||
|  |   acceleratorCopySynchronise(); | ||||||
|  |  | ||||||
|   if (nreq==0) return; |   if (nreq==0) return; | ||||||
|  |  | ||||||
|   std::vector<MPI_Status> status(nreq); |   std::vector<MPI_Status> status(nreq); | ||||||
|   | |||||||
| @@ -40,6 +40,9 @@ int                 GlobalSharedMemory::_ShmAlloc; | |||||||
| uint64_t            GlobalSharedMemory::_ShmAllocBytes; | uint64_t            GlobalSharedMemory::_ShmAllocBytes; | ||||||
|  |  | ||||||
| std::vector<void *> GlobalSharedMemory::WorldShmCommBufs; | std::vector<void *> GlobalSharedMemory::WorldShmCommBufs; | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  | void * GlobalSharedMemory::HostCommBuf; | ||||||
|  | #endif | ||||||
|  |  | ||||||
| Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm; | Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm; | ||||||
| int                 GlobalSharedMemory::WorldShmRank; | int                 GlobalSharedMemory::WorldShmRank; | ||||||
| @@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void) | |||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
| // Alloc, free shmem region | // Alloc, free shmem region | ||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  | void *SharedMemory::HostBufferMalloc(size_t bytes){ | ||||||
|  |   void *ptr = (void *)host_heap_top; | ||||||
|  |   host_heap_top  += bytes; | ||||||
|  |   host_heap_bytes+= bytes; | ||||||
|  |   if (host_heap_bytes >= host_heap_size) { | ||||||
|  |     std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl; | ||||||
|  |     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl; | ||||||
|  |     std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; | ||||||
|  |     std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl; | ||||||
|  |     std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl; | ||||||
|  |     assert(host_heap_bytes<host_heap_size); | ||||||
|  |   } | ||||||
|  |   return ptr; | ||||||
|  | } | ||||||
|  | void SharedMemory::HostBufferFreeAll(void) {  | ||||||
|  |   host_heap_top  =(size_t)HostCommBuf; | ||||||
|  |   host_heap_bytes=0; | ||||||
|  | } | ||||||
|  | #endif | ||||||
| void *SharedMemory::ShmBufferMalloc(size_t bytes){ | void *SharedMemory::ShmBufferMalloc(size_t bytes){ | ||||||
|   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes |   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes | ||||||
|   void *ptr = (void *)heap_top; |   void *ptr = (void *)heap_top; | ||||||
|   | |||||||
| @@ -75,7 +75,9 @@ public: | |||||||
|   static int           Hugepages; |   static int           Hugepages; | ||||||
|  |  | ||||||
|   static std::vector<void *> WorldShmCommBufs; |   static std::vector<void *> WorldShmCommBufs; | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   static void *HostCommBuf; | ||||||
|  | #endif | ||||||
|   static Grid_MPI_Comm WorldComm; |   static Grid_MPI_Comm WorldComm; | ||||||
|   static int           WorldRank; |   static int           WorldRank; | ||||||
|   static int           WorldSize; |   static int           WorldSize; | ||||||
| @@ -120,6 +122,13 @@ private: | |||||||
|   size_t heap_bytes; |   size_t heap_bytes; | ||||||
|   size_t heap_size; |   size_t heap_size; | ||||||
|  |  | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   size_t host_heap_top;  // set in free all | ||||||
|  |   size_t host_heap_bytes;// set in free all | ||||||
|  |   void *HostCommBuf;     // set in SetCommunicator | ||||||
|  |   size_t host_heap_size; // set in SetCommunicator | ||||||
|  | #endif | ||||||
|  |    | ||||||
| protected: | protected: | ||||||
|  |  | ||||||
|   Grid_MPI_Comm    ShmComm; // for barriers |   Grid_MPI_Comm    ShmComm; // for barriers | ||||||
| @@ -151,7 +160,10 @@ public: | |||||||
|   void *ShmBufferTranslate(int rank,void * local_p); |   void *ShmBufferTranslate(int rank,void * local_p); | ||||||
|   void *ShmBufferMalloc(size_t bytes); |   void *ShmBufferMalloc(size_t bytes); | ||||||
|   void  ShmBufferFreeAll(void) ; |   void  ShmBufferFreeAll(void) ; | ||||||
|    | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   void *HostBufferMalloc(size_t bytes); | ||||||
|  |   void HostBufferFreeAll(void); | ||||||
|  | #endif   | ||||||
|   ////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////// | ||||||
|   // Make info on Nodes & ranks and Shared memory available |   // Make info on Nodes & ranks and Shared memory available | ||||||
|   ////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -39,9 +39,11 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| #include <hip/hip_runtime_api.h> | #include <hip/hip_runtime_api.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
| #define GRID_SYCL_LEVEL_ZERO_IPC | #define GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|  | #define SHM_SOCKETS | ||||||
|  | #endif  | ||||||
| #include <syscall.h> | #include <syscall.h> | ||||||
| #define SHM_SOCKETS  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #include <sys/socket.h> | #include <sys/socket.h> | ||||||
| @@ -512,46 +514,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| // Hugetlbfs mapping intended | // Hugetlbfs mapping intended | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL) | #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL) | ||||||
|  |  | ||||||
| //if defined(GRID_SYCL) |  | ||||||
| #if 0 |  | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) |  | ||||||
| { |  | ||||||
|   void * ShmCommBuf ;  |  | ||||||
|   assert(_ShmSetup==1); |  | ||||||
|   assert(_ShmAlloc==0); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // allocate the pointer array for shared windows for our group |  | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   MPI_Barrier(WorldShmComm); |  | ||||||
|   WorldShmCommBufs.resize(WorldShmSize); |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Each MPI rank should allocate our own buffer |  | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   ShmCommBuf = acceleratorAllocDevice(bytes); |  | ||||||
|  |  | ||||||
|   if (ShmCommBuf == (void *)NULL ) { |  | ||||||
|     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; |  | ||||||
|     exit(EXIT_FAILURE);   |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  |  | ||||||
| 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; |  | ||||||
|  |  | ||||||
|   SharedMemoryZero(ShmCommBuf,bytes); |  | ||||||
|  |  | ||||||
|   assert(WorldShmSize == 1); |  | ||||||
|   for(int r=0;r<WorldShmSize;r++){ |  | ||||||
|     WorldShmCommBufs[r] = ShmCommBuf; |  | ||||||
|   } |  | ||||||
|   _ShmAllocBytes=bytes; |  | ||||||
|   _ShmAlloc=1; |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)   |  | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| { | { | ||||||
|   void * ShmCommBuf ;  |   void * ShmCommBuf ;  | ||||||
| @@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Each MPI rank should allocate our own buffer |   // Each MPI rank should allocate our own buffer | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   HostCommBuf= malloc(bytes); | ||||||
|  | #endif   | ||||||
|   ShmCommBuf = acceleratorAllocDevice(bytes); |   ShmCommBuf = acceleratorAllocDevice(bytes); | ||||||
|   if (ShmCommBuf == (void *)NULL ) { |   if (ShmCommBuf == (void *)NULL ) { | ||||||
|     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; |     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; | ||||||
| @@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   _ShmAllocBytes=bytes; |   _ShmAllocBytes=bytes; | ||||||
|   _ShmAlloc=1; |   _ShmAlloc=1; | ||||||
| } | } | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #else  | #else  | ||||||
| #ifdef GRID_MPI3_SHMMMAP | #ifdef GRID_MPI3_SHMMMAP | ||||||
| @@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) | |||||||
|   } |   } | ||||||
|   ShmBufferFreeAll(); |   ShmBufferFreeAll(); | ||||||
|  |  | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   host_heap_size = heap_size; | ||||||
|  |   HostCommBuf= GlobalSharedMemory::HostCommBuf; | ||||||
|  |   HostBufferFreeAll(); | ||||||
|  | #endif   | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////// | ||||||
|   // find comm ranks in our SHM group (i.e. which ranks are on our node) |   // find comm ranks in our SHM group (i.e. which ranks are on our node) | ||||||
|   ///////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -285,13 +285,25 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
| template<class vobj> | template<class vobj> | ||||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | ||||||
|   GridBase *grid = left.Grid(); |   GridBase *grid = left.Grid(); | ||||||
|   uint32_t csum=0; |  | ||||||
|   //  Uint32Checksum(left,csum); | #ifdef GRID_SYCL | ||||||
|  |   uint64_t csum=0; | ||||||
|  |   if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) | ||||||
|  |   { | ||||||
|  |     // Hack | ||||||
|  |     // Fast integer xor checksum. Can also be used in comms now. | ||||||
|  |     autoView(l_v,left,AcceleratorRead); | ||||||
|  |     Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); | ||||||
|  |     uint64_t *base= (uint64_t *)&l_v[0]; | ||||||
|  |     csum=svm_xor(base,words); | ||||||
|  |   } | ||||||
|  |   FlightRecorder::CsumLog(csum); | ||||||
|  | #endif | ||||||
|   ComplexD nrm = rankInnerProduct(left,right); |   ComplexD nrm = rankInnerProduct(left,right); | ||||||
|   RealD local = real(nrm); |   RealD local = real(nrm); | ||||||
|   GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI |   FlightRecorder::NormLog(real(nrm));  | ||||||
|   grid->GlobalSum(nrm); |   grid->GlobalSum(nrm); | ||||||
|   GridMPINormLog(local,real(nrm));  |   FlightRecorder::ReductionLog(local,real(nrm));  | ||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite | |||||||
|   return result; |   return result; | ||||||
| } | } | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); |  | ||||||
|  |  | ||||||
| /* | template<class Word> Word svm_xor(Word *vec,uint64_t L) | ||||||
| template<class Double> Double svm_reduce(Double *vec,uint64_t L) |  | ||||||
| { | { | ||||||
|   Double sumResult; zeroit(sumResult); |   Word xorResult; xorResult = 0; | ||||||
|   Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator); |   Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); | ||||||
|   Double identity;  zeroit(identity); |   Word identity;  identity=0; | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|      auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>()); |      auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, |      cgh.parallel_for(cl::sycl::range<1>{L}, | ||||||
| 		      Reduction, | 		      Reduction, | ||||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { | 		      [=] (cl::sycl::id<1> index, auto &sum) { | ||||||
| 	 sum +=vec[index]; | 	 sum ^=vec[index]; | ||||||
|      }); |      }); | ||||||
|    }); |    }); | ||||||
|   theGridAccelerator->wait(); |   theGridAccelerator->wait(); | ||||||
|   Double ret = d_sum[0]; |   Word ret = d_sum[0]; | ||||||
|   free(d_sum,*theGridAccelerator); |   free(d_sum,*theGridAccelerator); | ||||||
|   std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl; |  | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  | ||||||
| template <class vobj> | template <class vobj> | ||||||
| inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) | inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) | ||||||
| { | { | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| #pragma once | #pragma once | ||||||
|  | #include <type_traits> | ||||||
| #if defined(GRID_CUDA) | #if defined(GRID_CUDA) | ||||||
|  |  | ||||||
| #include <cub/cub.cuh> | #include <cub/cub.cuh> | ||||||
| @@ -90,61 +90,8 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V | |||||||
|    |    | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif  |  | ||||||
|  |  | ||||||
|  | template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||||
| #if defined(GRID_SYCL) |  | ||||||
| template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) |  | ||||||
| { |  | ||||||
|   size_t subvol_size = e1*e2; |  | ||||||
|  |  | ||||||
|   vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); |  | ||||||
|   vobj vobj_zero; |  | ||||||
|   zeroit(vobj_zero); |  | ||||||
|   for (int r = 0; r<rd; r++) {  |  | ||||||
|     mysum[r] = vobj_zero;  |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     |  | ||||||
|  |  | ||||||
|   auto rb_p = &reduction_buffer[0]; |  | ||||||
|  |  | ||||||
|   // autoView(Data_v, Data, AcceleratorRead); |  | ||||||
|  |  | ||||||
|   //prepare reduction buffer  |  | ||||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  |  | ||||||
|    |  | ||||||
|       int n = s / e2; |  | ||||||
|       int b = s % e2; |  | ||||||
|       int so=r*ostride; // base offset for start of plane  |  | ||||||
|       int ss= so+n*stride+b; |  | ||||||
|  |  | ||||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); |  | ||||||
|  |  | ||||||
|   }); |  | ||||||
|  |  | ||||||
|   for (int r = 0; r < rd; r++) { |  | ||||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |  | ||||||
|           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); |  | ||||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, |  | ||||||
|           Reduction, |  | ||||||
|           [=](cl::sycl::id<1> item, auto &sum) { |  | ||||||
|               auto s = item[0]; |  | ||||||
|               sum += rb_p[r*subvol_size+s]; |  | ||||||
|           }); |  | ||||||
|       }); |  | ||||||
|        |  | ||||||
|       |  | ||||||
|   } |  | ||||||
|   theGridAccelerator->wait(); |  | ||||||
|   for (int r = 0; r < rd; r++) { |  | ||||||
|     lvSum[r] = mysum[r]; |  | ||||||
|   } |  | ||||||
|   free(mysum,*theGridAccelerator); |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { |  | ||||||
|   typedef typename vobj::vector_type vector; |   typedef typename vobj::vector_type vector; | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |   const int words = sizeof(vobj)/sizeof(vector); | ||||||
|   const int osites = rd*e1*e2; |   const int osites = rd*e1*e2; | ||||||
| @@ -159,12 +106,8 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto | |||||||
| 	    buf[ss] = dat[ss*words+w]; | 	    buf[ss] = dat[ss*words+w]; | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     #if defined(GRID_CUDA) || defined(GRID_HIP) |     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|       sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); |        | ||||||
|     #elif defined(GRID_SYCL) |  | ||||||
|       sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); |  | ||||||
|     #endif |  | ||||||
|  |  | ||||||
|     for (int r = 0; r < rd; r++) { |     for (int r = 0; r < rd; r++) { | ||||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; |       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||||
|     } |     } | ||||||
| @@ -174,24 +117,66 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto | |||||||
|    |    | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||||
| { | { | ||||||
|   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. |   autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. | ||||||
|     if constexpr (sizeof(vobj) <= 256) {  |     if constexpr (sizeof(vobj) <= 256) {  | ||||||
|  |       sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|       #if defined(GRID_CUDA) || defined(GRID_HIP) |  | ||||||
|         sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|       #elif defined (GRID_SYCL) |  | ||||||
|         sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|       #endif |  | ||||||
|  |  | ||||||
|     } |     } | ||||||
|     else { |     else { | ||||||
|       sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |       sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #if defined(GRID_SYCL) | ||||||
|  | template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
|  | { | ||||||
|  |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   size_t subvol_size = e1*e2; | ||||||
|  |  | ||||||
|  |   vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); | ||||||
|  |   vobj vobj_zero; | ||||||
|  |   zeroit(vobj_zero); | ||||||
|  |      | ||||||
|  |   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||||
|  |  | ||||||
|  |   auto rb_p = &reduction_buffer[0]; | ||||||
|  |  | ||||||
|  |   autoView(Data_v, Data, AcceleratorRead); | ||||||
|  |  | ||||||
|  |   //prepare reduction buffer  | ||||||
|  |   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||||
|  |    | ||||||
|  |       int n = s / e2; | ||||||
|  |       int b = s % e2; | ||||||
|  |       int so=r*ostride; // base offset for start of plane  | ||||||
|  |       int ss= so+n*stride+b; | ||||||
|  |  | ||||||
|  |       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); | ||||||
|  |  | ||||||
|  |   }); | ||||||
|  |  | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |       mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable | ||||||
|  |       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|  |           auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); | ||||||
|  |           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||||
|  |           Reduction, | ||||||
|  |           [=](cl::sycl::id<1> item, auto &sum) { | ||||||
|  |               auto s = item[0]; | ||||||
|  |               sum += rb_p[r*subvol_size+s]; | ||||||
|  |           }); | ||||||
|  |       }); | ||||||
|  |       theGridAccelerator->wait(); | ||||||
|  |       lvSum[r] = mysum[0]; | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   free(mysum,*theGridAccelerator); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
| { | { | ||||||
|   // sum over reduced dimension planes, breaking out orthog dir |   // sum over reduced dimension planes, breaking out orthog dir | ||||||
| @@ -210,9 +195,13 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data | |||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | ||||||
| { | { | ||||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) |   #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|    |    | ||||||
|   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |    | ||||||
|  |   #elif defined(GRID_SYCL) | ||||||
|  |    | ||||||
|  |   sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|    |    | ||||||
|   #else |   #else | ||||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|   | |||||||
| @@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|     autoView(st_v , st,AcceleratorRead); |     autoView(st_v , st,AcceleratorRead); | ||||||
|  |  | ||||||
|    if( interior && exterior ) { |    if( interior && exterior ) { | ||||||
|  |      acceleratorFenceComputeStream(); | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
| @@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|     autoView(st_v ,st,AcceleratorRead); |     autoView(st_v ,st,AcceleratorRead); | ||||||
|  |  | ||||||
|    if( interior && exterior ) { |    if( interior && exterior ) { | ||||||
|  |      acceleratorFenceComputeStream(); | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|   | |||||||
| @@ -70,57 +70,6 @@ struct DefaultImplParams { | |||||||
| void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, | void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, | ||||||
| 				 int off,std::vector<std::pair<int,int> > & table); | 				 int off,std::vector<std::pair<int,int> > & table); | ||||||
|  |  | ||||||
| /* |  | ||||||
| template<class vobj,class cobj,class compressor> |  | ||||||
| void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline)); |  | ||||||
|  |  | ||||||
| template<class vobj,class cobj,class compressor> |  | ||||||
| void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) |  | ||||||
| { |  | ||||||
|   int num=table.size(); |  | ||||||
|   std::pair<int,int> *table_v = & table[0]; |  | ||||||
|  |  | ||||||
|   auto rhs_v = rhs.View(AcceleratorRead); |  | ||||||
|   accelerator_forNB( i,num, vobj::Nsimd(), { |  | ||||||
|     compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); |  | ||||||
|   }); |  | ||||||
|   rhs_v.ViewClose(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////////// |  | ||||||
| // Gather for when there *is* need to SIMD split with compression |  | ||||||
| /////////////////////////////////////////////////////////////////// |  | ||||||
| template<class cobj,class vobj,class compressor> |  | ||||||
| void Gather_plane_exchange_table(const Lattice<vobj> &rhs, |  | ||||||
| 				 commVector<cobj *> pointers, |  | ||||||
| 				 int dimension,int plane, |  | ||||||
| 				 int cbmask,compressor &compress,int type) __attribute__((noinline)); |  | ||||||
|  |  | ||||||
| template<class cobj,class vobj,class compressor> |  | ||||||
| void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table, |  | ||||||
| 				 const Lattice<vobj> &rhs, |  | ||||||
| 				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask, |  | ||||||
| 				 compressor &compress,int type) |  | ||||||
| { |  | ||||||
|   assert( (table.size()&0x1)==0); |  | ||||||
|   int num=table.size()/2; |  | ||||||
|   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane |  | ||||||
|  |  | ||||||
|   auto rhs_v = rhs.View(AcceleratorRead); |  | ||||||
|   auto rhs_p = &rhs_v[0]; |  | ||||||
|   auto p0=&pointers[0][0]; |  | ||||||
|   auto p1=&pointers[1][0]; |  | ||||||
|   auto tp=&table[0]; |  | ||||||
|   accelerator_forNB(j, num, vobj::Nsimd(), { |  | ||||||
|       compress.CompressExchange(p0,p1, rhs_p, j, |  | ||||||
| 				so+tp[2*j  ].second, |  | ||||||
| 				so+tp[2*j+1].second, |  | ||||||
| 				type); |  | ||||||
|   }); |  | ||||||
|   rhs_v.ViewClose(); |  | ||||||
| } |  | ||||||
| */ |  | ||||||
|  |  | ||||||
| void DslashResetCounts(void); | void DslashResetCounts(void); | ||||||
| void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); | void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); | ||||||
| void DslashLogFull(void); | void DslashLogFull(void); | ||||||
| @@ -258,6 +207,10 @@ public: | |||||||
|   struct Packet { |   struct Packet { | ||||||
|     void * send_buf; |     void * send_buf; | ||||||
|     void * recv_buf; |     void * recv_buf; | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |     void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE | ||||||
|  |     void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE | ||||||
|  | #endif | ||||||
|     Integer to_rank; |     Integer to_rank; | ||||||
|     Integer from_rank; |     Integer from_rank; | ||||||
|     Integer do_send; |     Integer do_send; | ||||||
| @@ -324,7 +277,7 @@ public: | |||||||
|   Vector<int> surface_list; |   Vector<int> surface_list; | ||||||
|  |  | ||||||
|   stencilVector<StencilEntry>  _entries; // Resident in managed memory |   stencilVector<StencilEntry>  _entries; // Resident in managed memory | ||||||
|   commVector<StencilEntry>     _entries_device; // Resident in managed memory |   commVector<StencilEntry>     _entries_device; // Resident in device memory | ||||||
|   std::vector<Packet> Packets; |   std::vector<Packet> Packets; | ||||||
|   std::vector<Merge> Mergers; |   std::vector<Merge> Mergers; | ||||||
|   std::vector<Merge> MergersSHM; |   std::vector<Merge> MergersSHM; | ||||||
| @@ -408,33 +361,16 @@ public: | |||||||
|   // Use OpenMP Tasks for cleaner ??? |   // Use OpenMP Tasks for cleaner ??? | ||||||
|   // must be called *inside* parallel region |   // must be called *inside* parallel region | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   /* |  | ||||||
|   void CommunicateThreaded() |  | ||||||
|   { |  | ||||||
| #ifdef GRID_OMP |  | ||||||
|     int mythread = omp_get_thread_num(); |  | ||||||
|     int nthreads = CartesianCommunicator::nCommThreads; |  | ||||||
| #else |  | ||||||
|     int mythread = 0; |  | ||||||
|     int nthreads = 1; |  | ||||||
| #endif |  | ||||||
|     if (nthreads == -1) nthreads = 1; |  | ||||||
|     if (mythread < nthreads) { |  | ||||||
|       for (int i = mythread; i < Packets.size(); i += nthreads) { |  | ||||||
| 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, |  | ||||||
| 						      Packets[i].to_rank, |  | ||||||
| 						      Packets[i].recv_buf, |  | ||||||
| 						      Packets[i].from_rank, |  | ||||||
| 						      Packets[i].bytes,i); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   */ |  | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Non blocking send and receive. Necessarily parallel. |   // Non blocking send and receive. Necessarily parallel. | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|  |     // All GPU kernel tasks must complete | ||||||
|  |     //    accelerator_barrier();     // All kernels should ALREADY be complete | ||||||
|  |     //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer | ||||||
|  |                                // But the HaloGather had a barrier too. | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       _grid->StencilSendToRecvFromBegin(MpiReqs, |       _grid->StencilSendToRecvFromBegin(MpiReqs, | ||||||
| 					Packets[i].send_buf, | 					Packets[i].send_buf, | ||||||
| @@ -443,16 +379,54 @@ public: | |||||||
| 					Packets[i].from_rank,Packets[i].do_recv, | 					Packets[i].from_rank,Packets[i].do_recv, | ||||||
| 					Packets[i].xbytes,Packets[i].rbytes,i); | 					Packets[i].xbytes,Packets[i].rbytes,i); | ||||||
|     } |     } | ||||||
|  | #else | ||||||
|  | #warning "Using COPY VIA HOST BUFFERS IN STENCIL" | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       // Introduce a host buffer with a cheap slab allocator and zero cost wipe all | ||||||
|  |       Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes); | ||||||
|  |       Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); | ||||||
|  |       if ( Packets[i].do_send ) { | ||||||
|  | 	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); | ||||||
|  |       } | ||||||
|  |       _grid->StencilSendToRecvFromBegin(MpiReqs, | ||||||
|  | 					Packets[i].host_send_buf, | ||||||
|  | 					Packets[i].to_rank,Packets[i].do_send, | ||||||
|  | 					Packets[i].host_recv_buf, | ||||||
|  | 					Packets[i].from_rank,Packets[i].do_recv, | ||||||
|  | 					Packets[i].xbytes,Packets[i].rbytes,i); | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |     // Get comms started then run checksums | ||||||
|  |     // Having this PRIOR to the dslash seems to make Sunspot work... (!) | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       if ( Packets[i].do_send ) | ||||||
|  | 	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     _grid->StencilSendToRecvFromComplete(MpiReqs,0); |     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done | ||||||
|     if   ( this->partialDirichlet ) DslashLogPartial(); |     if   ( this->partialDirichlet ) DslashLogPartial(); | ||||||
|     else if ( this->fullDirichlet ) DslashLogDirichlet(); |     else if ( this->fullDirichlet ) DslashLogDirichlet(); | ||||||
|     else DslashLogFull(); |     else DslashLogFull(); | ||||||
|     acceleratorCopySynchronise(); |     // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete | ||||||
|  |     //    accelerator_barrier();  | ||||||
|     _grid->StencilBarrier();  |     _grid->StencilBarrier();  | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  | #warning "Using COPY VIA HOST BUFFERS IN STENCIL" | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       if ( Packets[i].do_recv ) { | ||||||
|  | 	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     _grid->HostBufferFreeAll(); | ||||||
|  | #endif | ||||||
|  |     // run any checksums | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       if ( Packets[i].do_recv ) | ||||||
|  | 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Blocking send and receive. Either sequential or parallel. |   // Blocking send and receive. Either sequential or parallel. | ||||||
| @@ -528,6 +502,7 @@ public: | |||||||
|   template<class compressor> |   template<class compressor> | ||||||
|   void HaloGather(const Lattice<vobj> &source,compressor &compress) |   void HaloGather(const Lattice<vobj> &source,compressor &compress) | ||||||
|   { |   { | ||||||
|  |     //    accelerator_barrier(); | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes |     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||||
|  |  | ||||||
|     assert(source.Grid()==_grid); |     assert(source.Grid()==_grid); | ||||||
| @@ -540,10 +515,9 @@ public: | |||||||
|       compress.Point(point); |       compress.Point(point); | ||||||
|       HaloGatherDir(source,compress,point,face_idx); |       HaloGatherDir(source,compress,point,face_idx); | ||||||
|     } |     } | ||||||
|     accelerator_barrier(); |     accelerator_barrier(); // All my local gathers are complete | ||||||
|     face_table_computed=1; |     face_table_computed=1; | ||||||
|     assert(u_comm_offset==_unified_buffer_size); |     assert(u_comm_offset==_unified_buffer_size); | ||||||
|  |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ///////////////////////// |   ///////////////////////// | ||||||
| @@ -579,6 +553,7 @@ public: | |||||||
|       accelerator_forNB(j, words, cobj::Nsimd(), { |       accelerator_forNB(j, words, cobj::Nsimd(), { | ||||||
| 	  coalescedWrite(to[j] ,coalescedRead(from [j])); | 	  coalescedWrite(to[j] ,coalescedRead(from [j])); | ||||||
|       }); |       }); | ||||||
|  |       acceleratorFenceComputeStream(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|    |    | ||||||
| @@ -669,6 +644,7 @@ public: | |||||||
|     for(int i=0;i<dd.size();i++){ |     for(int i=0;i<dd.size();i++){ | ||||||
|       decompressor::DecompressFace(decompress,dd[i]); |       decompressor::DecompressFace(decompress,dd[i]); | ||||||
|     } |     } | ||||||
|  |     acceleratorFenceComputeStream(); // dependent kernels | ||||||
|   } |   } | ||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   // Set up routines |   // Set up routines | ||||||
| @@ -1224,7 +1200,6 @@ public: | |||||||
| 	  /////////////////////////////////////////////////////////// | 	  /////////////////////////////////////////////////////////// | ||||||
| 	  int do_send = (comms_send|comms_partial_send) && (!shm_send ); | 	  int do_send = (comms_send|comms_partial_send) && (!shm_send ); | ||||||
| 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv ); | 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv ); | ||||||
| 	   |  | ||||||
| 	  AddPacket((void *)&send_buf[comm_off], | 	  AddPacket((void *)&send_buf[comm_off], | ||||||
| 		    (void *)&recv_buf[comm_off], | 		    (void *)&recv_buf[comm_off], | ||||||
| 		    xmit_to_rank, do_send, | 		    xmit_to_rank, do_send, | ||||||
|   | |||||||
							
								
								
									
										339
									
								
								Grid/util/FlightRecorder.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										339
									
								
								Grid/util/FlightRecorder.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,339 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
|  |     Source file: ./lib/Init.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <peterboyle@MacBook-Pro.local> | ||||||
|  | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  | /////////////////////////////////////////////////////// | ||||||
|  | // Grid Norm logging for repro testing | ||||||
|  | /////////////////////////////////////////////////////// | ||||||
|  | int FlightRecorder::PrintEntireLog; | ||||||
|  | int FlightRecorder::ContinueOnFail; | ||||||
|  | int FlightRecorder::LoggingMode; | ||||||
|  | int FlightRecorder::ChecksumComms; | ||||||
|  | int FlightRecorder::ChecksumCommsSend; | ||||||
|  | int32_t  FlightRecorder::XmitLoggingCounter; | ||||||
|  | int32_t  FlightRecorder::RecvLoggingCounter; | ||||||
|  | int32_t  FlightRecorder::CsumLoggingCounter; | ||||||
|  | int32_t  FlightRecorder::NormLoggingCounter; | ||||||
|  | int32_t  FlightRecorder::ReductionLoggingCounter; | ||||||
|  | uint64_t FlightRecorder::ErrorCounter; | ||||||
|  | std::vector<double> FlightRecorder::NormLogVector; | ||||||
|  | std::vector<double> FlightRecorder::ReductionLogVector; | ||||||
|  | std::vector<uint64_t> FlightRecorder::CsumLogVector; | ||||||
|  | std::vector<uint64_t> FlightRecorder::XmitLogVector; | ||||||
|  | std::vector<uint64_t> FlightRecorder::RecvLogVector; | ||||||
|  |  | ||||||
|  | void FlightRecorder::ResetCounters(void) | ||||||
|  | { | ||||||
|  |   XmitLoggingCounter=0; | ||||||
|  |   RecvLoggingCounter=0; | ||||||
|  |   CsumLoggingCounter=0; | ||||||
|  |   NormLoggingCounter=0; | ||||||
|  |   ReductionLoggingCounter=0; | ||||||
|  | } | ||||||
|  | void FlightRecorder::Truncate(void) | ||||||
|  | { | ||||||
|  |   ResetCounters(); | ||||||
|  |   XmitLogVector.resize(0); | ||||||
|  |   RecvLogVector.resize(0); | ||||||
|  |   NormLogVector.resize(0); | ||||||
|  |   CsumLogVector.resize(0); | ||||||
|  |   ReductionLogVector.resize(0); | ||||||
|  | } | ||||||
|  | void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode) | ||||||
|  | { | ||||||
|  |   switch ( mode ) { | ||||||
|  |   case LoggingModePrint: | ||||||
|  |     SetLoggingModePrint(); | ||||||
|  |     break; | ||||||
|  |   case LoggingModeRecord: | ||||||
|  |     SetLoggingModeRecord(); | ||||||
|  |     break; | ||||||
|  |   case LoggingModeVerify: | ||||||
|  |     SetLoggingModeVerify(); | ||||||
|  |     break; | ||||||
|  |   case LoggingModeNone: | ||||||
|  |     LoggingMode = mode; | ||||||
|  |     Truncate(); | ||||||
|  |     break; | ||||||
|  |   default: | ||||||
|  |     assert(0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void FlightRecorder::SetLoggingModePrint(void) | ||||||
|  | { | ||||||
|  |   std::cout << " FlightRecorder: set to print output " <<std::endl; | ||||||
|  |   Truncate(); | ||||||
|  |   LoggingMode = LoggingModePrint; | ||||||
|  | } | ||||||
|  | void FlightRecorder::SetLoggingModeRecord(void) | ||||||
|  | { | ||||||
|  |   std::cout << " FlightRecorder: set to RECORD " <<std::endl; | ||||||
|  |   Truncate(); | ||||||
|  |   LoggingMode = LoggingModeRecord; | ||||||
|  | } | ||||||
|  | void FlightRecorder::SetLoggingModeVerify(void) | ||||||
|  | { | ||||||
|  |   std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl; | ||||||
|  |   ResetCounters(); | ||||||
|  |   LoggingMode = LoggingModeVerify; | ||||||
|  | } | ||||||
|  | uint64_t FlightRecorder::ErrorCount(void) | ||||||
|  | { | ||||||
|  |   return ErrorCounter; | ||||||
|  | } | ||||||
|  | void FlightRecorder::NormLog(double value) | ||||||
|  | { | ||||||
|  |   uint64_t hex = * ( (uint64_t *)&value ); | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|  |     NormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|  |     NormLogVector.push_back(value); | ||||||
|  |     NormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |  | ||||||
|  |     if(NormLoggingCounter < NormLogVector.size()){ | ||||||
|  |       uint64_t hexref  = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] ); | ||||||
|  |  | ||||||
|  |       if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) { | ||||||
|  |  | ||||||
|  | 	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter | ||||||
|  | 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" " | ||||||
|  | 		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl; | ||||||
|  |  | ||||||
|  | 	std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl; | ||||||
|  |  | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		NormLoggingCounter,NormLogVector.size(), | ||||||
|  | 		value, NormLogVector[NormLoggingCounter]); fflush(stderr); | ||||||
|  |  | ||||||
|  | 	if(!ContinueOnFail)assert(0); // Force takedown of job | ||||||
|  | 	   | ||||||
|  | 	ErrorCounter++; | ||||||
|  |       } else { | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex | ||||||
|  | 		   <<" "<<hex<<" "<<hexref | ||||||
|  | 		   <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |         | ||||||
|  |     } | ||||||
|  |     if ( NormLogVector.size()==NormLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl; | ||||||
|  |     } | ||||||
|  |     NormLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void FlightRecorder::CsumLog(uint64_t hex) | ||||||
|  | { | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|  |     CsumLoggingCounter++; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|  |     CsumLogVector.push_back(hex); | ||||||
|  |     CsumLoggingCounter++; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |      | ||||||
|  |     if(CsumLoggingCounter < CsumLogVector.size()) { | ||||||
|  |  | ||||||
|  |       uint64_t hexref  = CsumLogVector[CsumLoggingCounter] ; | ||||||
|  |  | ||||||
|  |       if ( hex != hexref ) { | ||||||
|  |  | ||||||
|  |         std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter | ||||||
|  | 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl; | ||||||
|  |  | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		CsumLoggingCounter,hex, hexref); | ||||||
|  | 	fflush(stderr); | ||||||
|  |  | ||||||
|  | 	if(!ContinueOnFail) assert(0); // Force takedown of job | ||||||
|  | 	   | ||||||
|  | 	ErrorCounter++; | ||||||
|  |  | ||||||
|  |       } else { | ||||||
|  |  | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex | ||||||
|  | 		   <<" "<<hex<<" "<<hexref<<std::dec<<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     }   | ||||||
|  |     if ( CsumLogVector.size()==CsumLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl; | ||||||
|  |     } | ||||||
|  |     CsumLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void FlightRecorder::ReductionLog(double local,double global) | ||||||
|  | { | ||||||
|  |   uint64_t hex_l = * ( (uint64_t *)&local ); | ||||||
|  |   uint64_t hex_g = * ( (uint64_t *)&global ); | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl; | ||||||
|  |     ReductionLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl; | ||||||
|  |     ReductionLogVector.push_back(global); | ||||||
|  |     ReductionLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |     if(ReductionLoggingCounter < ReductionLogVector.size()){ | ||||||
|  |       if ( global != ReductionLogVector[ReductionLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		ReductionLoggingCounter,ReductionLogVector.size(), | ||||||
|  | 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr); | ||||||
|  | 	 | ||||||
|  | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
|  | 	ErrorCounter++; | ||||||
|  |       } else { | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if ( ReductionLogVector.size()==ReductionLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl; | ||||||
|  |     } | ||||||
|  |     ReductionLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void FlightRecorder::xmitLog(void *buf,uint64_t bytes) | ||||||
|  | { | ||||||
|  |   if ( ChecksumCommsSend ){ | ||||||
|  |   uint64_t *ubuf = (uint64_t *)buf; | ||||||
|  |   if(LoggingMode == LoggingModeNone) return; | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  |   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
|  |     XmitLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
|  |     XmitLogVector.push_back(_xor); | ||||||
|  |     XmitLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |     if(XmitLoggingCounter < XmitLogVector.size()){ | ||||||
|  |       if ( _xor != XmitLogVector[XmitLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		XmitLoggingCounter,XmitLogVector.size(), | ||||||
|  | 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr); | ||||||
|  | 	 | ||||||
|  | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
|  | 	ErrorCounter++; | ||||||
|  |       } else { | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  XmitLogVector[XmitLoggingCounter] <<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if ( XmitLogVector.size()==XmitLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl; | ||||||
|  |     } | ||||||
|  |     XmitLoggingCounter++; | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   } else { | ||||||
|  |     uint64_t word = 1; | ||||||
|  |     deviceVector<uint64_t> dev(1); | ||||||
|  |     acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t)); | ||||||
|  |     acceleratorCopySynchronise(); | ||||||
|  |     MPI_Barrier(MPI_COMM_WORLD); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) | ||||||
|  | { | ||||||
|  |   if ( ChecksumComms ){ | ||||||
|  |   uint64_t *ubuf = (uint64_t *)buf; | ||||||
|  |   if(LoggingMode == LoggingModeNone) return; | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  |   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
|  |     RecvLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
|  |     RecvLogVector.push_back(_xor); | ||||||
|  |     RecvLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |     if(RecvLoggingCounter < RecvLogVector.size()){ | ||||||
|  |       if ( _xor != RecvLogVector[RecvLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		RecvLoggingCounter,RecvLogVector.size(), | ||||||
|  | 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr); | ||||||
|  | 	 | ||||||
|  | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
|  | 	ErrorCounter++; | ||||||
|  |       } else { | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  RecvLogVector[RecvLoggingCounter] <<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if ( RecvLogVector.size()==RecvLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<RecvLoggingCounter<<" sends "<<std::endl; | ||||||
|  |     } | ||||||
|  |     RecvLoggingCounter++; | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
							
								
								
									
										43
									
								
								Grid/util/FlightRecorder.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								Grid/util/FlightRecorder.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | |||||||
|  | #pragma once | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  | class FlightRecorder { | ||||||
|  |  public: | ||||||
|  |   enum LoggingMode_t { | ||||||
|  |     LoggingModeNone, | ||||||
|  |     LoggingModePrint, | ||||||
|  |     LoggingModeRecord, | ||||||
|  |     LoggingModeVerify | ||||||
|  |   }; | ||||||
|  |    | ||||||
|  |   static int                   LoggingMode; | ||||||
|  |   static uint64_t              ErrorCounter; | ||||||
|  |   static int32_t               XmitLoggingCounter; | ||||||
|  |   static int32_t               RecvLoggingCounter; | ||||||
|  |   static int32_t               CsumLoggingCounter; | ||||||
|  |   static int32_t               NormLoggingCounter; | ||||||
|  |   static int32_t               ReductionLoggingCounter; | ||||||
|  |   static std::vector<uint64_t> XmitLogVector; | ||||||
|  |   static std::vector<uint64_t> RecvLogVector; | ||||||
|  |   static std::vector<uint64_t> CsumLogVector; | ||||||
|  |   static std::vector<double>   NormLogVector; | ||||||
|  |   static std::vector<double>   ReductionLogVector; | ||||||
|  |   static int ContinueOnFail; | ||||||
|  |   static int PrintEntireLog; | ||||||
|  |   static int ChecksumComms; | ||||||
|  |   static int ChecksumCommsSend; | ||||||
|  |   static void SetLoggingModePrint(void); | ||||||
|  |   static void SetLoggingModeRecord(void); | ||||||
|  |   static void SetLoggingModeVerify(void); | ||||||
|  |   static void SetLoggingMode(LoggingMode_t mode); | ||||||
|  |   static void NormLog(double value); | ||||||
|  |   static void CsumLog(uint64_t csum); | ||||||
|  |   static void ReductionLog(double lcl, double glbl); | ||||||
|  |   static void Truncate(void); | ||||||
|  |   static void ResetCounters(void); | ||||||
|  |   static uint64_t ErrorCount(void); | ||||||
|  |   static void xmitLog(void *,uint64_t bytes); | ||||||
|  |   static void recvLog(void *,uint64_t bytes,int rank); | ||||||
|  | }; | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| @@ -90,129 +90,6 @@ NAMESPACE_BEGIN(Grid); | |||||||
| static Coordinate Grid_default_latt; | static Coordinate Grid_default_latt; | ||||||
| static Coordinate Grid_default_mpi; | static Coordinate Grid_default_mpi; | ||||||
|  |  | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////// |  | ||||||
| // Grid Norm logging for repro testing |  | ||||||
| /////////////////////////////////////////////////////// |  | ||||||
| int GridNormLoggingMode; |  | ||||||
| int32_t GridNormLoggingCounter; |  | ||||||
| int32_t GridMPINormLoggingCounter; |  | ||||||
| std::vector<double> GridNormLogVector; |  | ||||||
| std::vector<double> GridMPINormLogVector; |  | ||||||
| std::vector<uint32_t> GridCsumLogVector; |  | ||||||
|  |  | ||||||
| void SetGridNormLoggingMode(GridNormLoggingMode_t mode) |  | ||||||
| { |  | ||||||
|   switch ( mode ) { |  | ||||||
|   case GridNormLoggingModePrint: |  | ||||||
|     SetGridNormLoggingModePrint(); |  | ||||||
|     break; |  | ||||||
|   case GridNormLoggingModeRecord: |  | ||||||
|     SetGridNormLoggingModeRecord(); |  | ||||||
|     break; |  | ||||||
|   case GridNormLoggingModeVerify: |  | ||||||
|     SetGridNormLoggingModeVerify(); |  | ||||||
|     break; |  | ||||||
|   case GridNormLoggingModeNone: |  | ||||||
|     GridNormLoggingMode = mode; |  | ||||||
|     GridNormLoggingCounter=0; |  | ||||||
|     GridMPINormLoggingCounter=0; |  | ||||||
|     GridNormLogVector.resize(0); |  | ||||||
|     GridCsumLogVector.resize(0); |  | ||||||
|     GridMPINormLogVector.resize(0); |  | ||||||
|     break; |  | ||||||
|   default: |  | ||||||
|     assert(0); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void SetGridNormLoggingModePrint(void) |  | ||||||
| { |  | ||||||
|   std::cout << " GridNormLogging Reproducibility logging set to print output " <<std::endl; |  | ||||||
|   GridNormLoggingCounter = 0; |  | ||||||
|   GridMPINormLoggingCounter=0; |  | ||||||
|   GridNormLogVector.resize(0); |  | ||||||
|   GridCsumLogVector.resize(0); |  | ||||||
|   GridMPINormLogVector.resize(0); |  | ||||||
|   GridNormLoggingMode = GridNormLoggingModePrint; |  | ||||||
| } |  | ||||||
| void SetGridNormLoggingModeRecord(void) |  | ||||||
| { |  | ||||||
|   std::cout << " GridNormLogging Reproducibility logging set to RECORD " <<std::endl; |  | ||||||
|   GridNormLoggingCounter = 0; |  | ||||||
|   GridMPINormLoggingCounter=0; |  | ||||||
|   GridNormLogVector.resize(0); |  | ||||||
|   GridCsumLogVector.resize(0); |  | ||||||
|   GridMPINormLogVector.resize(0); |  | ||||||
|   GridNormLoggingMode = GridNormLoggingModeRecord; |  | ||||||
| } |  | ||||||
| void SetGridNormLoggingModeVerify(void) |  | ||||||
| { |  | ||||||
|   std::cout << " GridNormLogging Reproducibility logging set to VERIFY " << GridNormLogVector.size()<< " log entries "<<std::endl; |  | ||||||
|   GridNormLoggingCounter = 0; |  | ||||||
|   GridMPINormLoggingCounter=0; |  | ||||||
|   GridNormLoggingMode = GridNormLoggingModeVerify; |  | ||||||
| } |  | ||||||
| void GridNormLog(double value,uint32_t csum) |  | ||||||
| { |  | ||||||
|   if(GridNormLoggingMode == GridNormLoggingModePrint) { |  | ||||||
|     std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value << " csum " <<std::hex<<csum<<std::dec <<std::endl; |  | ||||||
|     GridNormLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(GridNormLoggingMode == GridNormLoggingModeRecord) { |  | ||||||
|     GridNormLogVector.push_back(value); |  | ||||||
|     GridCsumLogVector.push_back(csum); |  | ||||||
|     GridNormLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(GridNormLoggingMode == GridNormLoggingModeVerify) { |  | ||||||
|     assert(GridNormLoggingCounter < GridNormLogVector.size()); |  | ||||||
|     if ( (value != GridNormLogVector[GridNormLoggingCounter]) |  | ||||||
| 	 || (csum!=GridCsumLogVector[GridNormLoggingCounter]) ) { |  | ||||||
|       std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<GridNormLogVector[GridNormLoggingCounter] <<std::endl; |  | ||||||
|       std::cerr << " Oops got csum "<< std::hex<<csum<<" expect "<<GridCsumLogVector[GridNormLoggingCounter] <<std::endl; |  | ||||||
|       fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e %x %x\n", |  | ||||||
| 	      GridHostname(), |  | ||||||
| 	      GlobalSharedMemory::WorldShmRank, |  | ||||||
| 	      GridNormLoggingCounter,GridNormLogVector.size(), |  | ||||||
| 	      value, GridNormLogVector[GridNormLoggingCounter], |  | ||||||
| 	      csum, GridCsumLogVector[GridNormLoggingCounter]); fflush(stderr); |  | ||||||
|       assert(0); // Force takedown of job |  | ||||||
|     } |  | ||||||
|     if ( GridNormLogVector.size()==GridNormLoggingCounter ) { |  | ||||||
|       std::cout << " GridNormLogging : Verified entire sequence of "<<GridNormLoggingCounter<<" norms "<<std::endl; |  | ||||||
|     } |  | ||||||
|     GridNormLoggingCounter++; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void GridMPINormLog(double local,double result) |  | ||||||
| { |  | ||||||
|   if(GridNormLoggingMode == GridNormLoggingModePrint) { |  | ||||||
|     std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << " -> " <<result <<std::endl; |  | ||||||
|     GridMPINormLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(GridNormLoggingMode == GridNormLoggingModeRecord) { |  | ||||||
|     std::cerr<<"GridMPINormLog RECORDING : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl; |  | ||||||
|     GridMPINormLogVector.push_back(result); |  | ||||||
|     GridMPINormLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(GridNormLoggingMode == GridNormLoggingModeVerify) { |  | ||||||
|     std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl; |  | ||||||
|     assert(GridMPINormLoggingCounter < GridMPINormLogVector.size()); |  | ||||||
|     if ( result != GridMPINormLogVector[GridMPINormLoggingCounter] ) { |  | ||||||
|       fprintf(stderr,"%s:%d MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e hist %.16e\n", |  | ||||||
| 	      GridHostname(), |  | ||||||
| 	      GlobalSharedMemory::WorldShmRank, |  | ||||||
| 	      GridMPINormLoggingCounter,GridMPINormLogVector.size(), |  | ||||||
| 	      result, local, GridMPINormLogVector[GridMPINormLoggingCounter]); fflush(stderr); |  | ||||||
|       assert(0); // Force takedown of job |  | ||||||
|     } |  | ||||||
|     if ( GridMPINormLogVector.size()==GridMPINormLoggingCounter ) { |  | ||||||
|       std::cout << " GridMPINormLogging : Verified entire sequence of "<<GridMPINormLoggingCounter<<" norms "<<std::endl; |  | ||||||
|     } |  | ||||||
|     GridMPINormLoggingCounter++; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| int GridThread::_threads =1; | int GridThread::_threads =1; | ||||||
| int GridThread::_hyperthreads=1; | int GridThread::_hyperthreads=1; | ||||||
| int GridThread::_cores=1; | int GridThread::_cores=1; | ||||||
|   | |||||||
| @@ -70,21 +70,6 @@ void GridParseLayout(char **argv,int argc, | |||||||
| void printHash(void); | void printHash(void); | ||||||
|  |  | ||||||
|  |  | ||||||
| enum GridNormLoggingMode_t { |  | ||||||
|   GridNormLoggingModeNone, |  | ||||||
|   GridNormLoggingModePrint, |  | ||||||
|   GridNormLoggingModeRecord, |  | ||||||
|   GridNormLoggingModeVerify |  | ||||||
| }; |  | ||||||
| //extern int GridNormLoggingMode; |  | ||||||
| //extern int32_t GridNormLoggingCounter; |  | ||||||
| //extern std::vector<double> GridNormLogVector; |  | ||||||
| void SetGridNormLoggingModePrint(void); |  | ||||||
| void SetGridNormLoggingModeRecord(void); |  | ||||||
| void SetGridNormLoggingModeVerify(void); |  | ||||||
| void SetGridNormLoggingMode(GridNormLoggingMode_t mode); |  | ||||||
| void GridNormLog(double value,uint32_t csum); |  | ||||||
| void GridMPINormLog(double lcl, double glbl); |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| #ifndef GRID_UTIL_H | #pragma once | ||||||
| #define GRID_UTIL_H |  | ||||||
| #include <Grid/util/Coordinate.h> | #include <Grid/util/Coordinate.h> | ||||||
| #include <Grid/util/Lexicographic.h> | #include <Grid/util/Lexicographic.h> | ||||||
| #include <Grid/util/Init.h> | #include <Grid/util/Init.h> | ||||||
| #endif | #include <Grid/util/FlightRecorder.h> | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										21
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								configure.ac
									
									
									
									
									
								
							| @@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in | |||||||
| esac | esac | ||||||
|  |  | ||||||
| ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons | ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons | ||||||
| AC_ARG_ENABLE([accelerator-cshift], | AC_ARG_ENABLE([accelerator-aware-mpi], | ||||||
|     [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])], |     [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], | ||||||
|     [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes]) |     [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) | ||||||
|  |  | ||||||
| AC_ARG_ENABLE([ucx-buggy], | case ${ac_ACCELERATOR_AWARE_MPI} in | ||||||
|     [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])], |  | ||||||
|     [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no]) |  | ||||||
|  |  | ||||||
| case ${ac_UCXBUGGY} in |  | ||||||
|     yes) |     yes) | ||||||
|     ac_ACC_CSHIFT=no;; |       AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) | ||||||
|     *);; |       AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; | ||||||
| esac |  | ||||||
|  |  | ||||||
| case ${ac_ACC_CSHIFT} in |  | ||||||
|     yes) |  | ||||||
|       AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);; |  | ||||||
|     *);; |     *);; | ||||||
| esac | esac | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,16 +1,16 @@ | |||||||
| TOOLS=$HOME/tools |  | ||||||
| ../../configure \ | ../../configure \ | ||||||
| 	--enable-simd=GPU \ | 	--enable-simd=GPU \ | ||||||
| 	--enable-gen-simd-width=64 \ | 	--enable-gen-simd-width=64 \ | ||||||
| 	--enable-comms=mpi-auto \ | 	--enable-comms=mpi-auto \ | ||||||
| 	--enable-accelerator-cshift \ |  | ||||||
| 	--disable-gparity \ | 	--disable-gparity \ | ||||||
| 	--disable-fermion-reps \ | 	--disable-fermion-reps \ | ||||||
| 	--enable-shm=nvlink \ | 	--enable-shm=nvlink \ | ||||||
| 	--enable-accelerator=sycl \ | 	--enable-accelerator=sycl \ | ||||||
|  | 	--enable-accelerator-aware-mpi=no\ | ||||||
| 	--enable-unified=no \ | 	--enable-unified=no \ | ||||||
| 	MPICXX=mpicxx \ | 	MPICXX=mpicxx \ | ||||||
| 	CXX=icpx \ | 	CXX=icpx \ | ||||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \ | 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ | ||||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel" | 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								systems/Aurora/sourceme-sunspot-deterministic.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								systems/Aurora/sourceme-sunspot-deterministic.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
| @@ -30,6 +30,7 @@ export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | |||||||
| unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
| unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
| unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| #!/bin/bash | #!/bin/bash | ||||||
|  |  | ||||||
| #PBS -l select=16:system=sunspot,place=scatter | #PBS -l select=32:system=sunspot,place=scatter | ||||||
| #PBS -A LatticeQCD_aesp_CNDA | #PBS -A LatticeQCD_aesp_CNDA | ||||||
| #PBS -l walltime=02:00:00 | #PBS -l walltime=02:00:00 | ||||||
| #PBS -N reproN | #PBS -N reproN | ||||||
| @@ -15,13 +15,23 @@ module load mpich/51.2/icc-all-deterministic-pmix-gpu | |||||||
| # 56 cores / 6 threads ~9 | # 56 cores / 6 threads ~9 | ||||||
| export OMP_NUM_THREADS=6 | export OMP_NUM_THREADS=6 | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
| export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  |  | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 | ||||||
|  | export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
|  |  | ||||||
|  | export GRID_PRINT_ENTIRE_LOG=0 | ||||||
|  | export GRID_CHECKSUM_RECV_BUF=0 | ||||||
|  | export GRID_CHECKSUM_SEND_BUF=0 | ||||||
|  |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
| export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
| @@ -46,29 +56,39 @@ cd $PBS_O_WORKDIR | |||||||
| THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
| echo Node $n is $THIS_NODE | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
| DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE | DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
| mkdir -p $DIR | mkdir -p $DIR | ||||||
| cd $DIR | cd $DIR | ||||||
|  |  | ||||||
| echo $THIS_NODE > nodefile | echo $THIS_NODE > nodefile | ||||||
|  |  | ||||||
|  | #CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | #	     ../../gpu_tile_compact.sh \ | ||||||
|  | #	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | #		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
| CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
| 	     ../../gpu_tile_compact.sh \ | 	     ../../gpu_tile_compact.sh \ | ||||||
| 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
| 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | echo $CMD > command-line | ||||||
|  | env > environment | ||||||
| $CMD & | $CMD & | ||||||
|  |  | ||||||
| done | done | ||||||
|  |  | ||||||
| wait | # Suspicious wait is allowing jobs to collide and knock out | ||||||
|  | #wait | ||||||
|  |  | ||||||
|  | sleep 6500 | ||||||
|  |  | ||||||
| for n in ` eval echo {1..$NN} ` | for n in ` eval echo {1..$NN} ` | ||||||
| do | do | ||||||
|  |  | ||||||
| THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
| DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE | DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
| cd $DIR | cd $DIR | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| TOOLS=$HOME/tools |  | ||||||
| ../../configure \ | ../../configure \ | ||||||
| 	--enable-simd=GPU \ | 	--enable-simd=GPU \ | ||||||
| 	--enable-gen-simd-width=64 \ | 	--enable-gen-simd-width=64 \ | ||||||
| @@ -11,6 +11,6 @@ TOOLS=$HOME/tools | |||||||
| 	--enable-unified=no \ | 	--enable-unified=no \ | ||||||
| 	MPICXX=mpicxx \ | 	MPICXX=mpicxx \ | ||||||
| 	CXX=icpx \ | 	CXX=icpx \ | ||||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ | 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ | ||||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" | 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								systems/Sunspot/sourceme.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								systems/Sunspot/sourceme.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
							
								
								
									
										81
									
								
								systems/Sunspot/tests/repro1gpu.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										81
									
								
								systems/Sunspot/tests/repro1gpu.pbs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,81 @@ | |||||||
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=16:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N repro1gpu | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  |  | ||||||
|  | for g in {0..11} | ||||||
|  | do | ||||||
|  | export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) | ||||||
|  | export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) | ||||||
|  | export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) | ||||||
|  |  | ||||||
|  | export numa=${NUMA_MAP[$g]} | ||||||
|  | export gpu_id=${GPU_MAP[$g]} | ||||||
|  | export tile_id=${TILE_MAP[$g]} | ||||||
|  | export gpu=$gpu_id.$tile_id | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  | echo $gpu > gpu | ||||||
|  |  | ||||||
|  | export ZE_AFFINITY_MASK=$gpu | ||||||
|  | export ONEAPI_DEVICE_FILTER=gpu,level_zero | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \ | ||||||
|  | 	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ | ||||||
|  | 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" | ||||||
|  | echo $CMD | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  | done | ||||||
|  |  | ||||||
|  | wait | ||||||
|  |  | ||||||
							
								
								
									
										97
									
								
								systems/Sunspot/tests/reproN.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								systems/Sunspot/tests/reproN.pbs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | |||||||
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=32:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N reproN | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  |  | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 | ||||||
|  |  | ||||||
|  | export GRID_PRINT_ENTIRE_LOG=0 | ||||||
|  | export GRID_CHECKSUM_RECV_BUF=1 | ||||||
|  | export GRID_CHECKSUM_SEND_BUF=0 | ||||||
|  |  | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  | DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  |  | ||||||
|  | #CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | #	     ../../gpu_tile_compact.sh \ | ||||||
|  | #	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | #		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | 	     ../../gpu_tile_compact.sh \ | ||||||
|  | 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | echo $CMD > command-line | ||||||
|  | env > environment | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  |  | ||||||
|  | # Suspicious wait is allowing jobs to collide and knock out | ||||||
|  | #wait | ||||||
|  |  | ||||||
|  | sleep 6500 | ||||||
|  |  | ||||||
|  | for n in ` eval echo {1..$NN} ` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | grep Oops Grid.stderr.* > failures.$PBS_JOBID | ||||||
|  | rm core.* | ||||||
|  |  | ||||||
|  | done | ||||||
| @@ -34,6 +34,7 @@ using namespace Grid; | |||||||
| #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX | #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
| template<class Matrix,class Field> | template<class Matrix,class Field> | ||||||
|   class SchurDiagMooeeOperatorParanoid :  public SchurOperatorBase<Field> { |   class SchurDiagMooeeOperatorParanoid :  public SchurOperatorBase<Field> { | ||||||
| @@ -143,14 +144,21 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   time_t start = time(NULL); |   time_t start = time(NULL); | ||||||
|  |  | ||||||
|   uint32_t csum, csumref; |   FlightRecorder::ContinueOnFail = 0; | ||||||
|   csumref=0; |   FlightRecorder::PrintEntireLog = 0; | ||||||
|  |   FlightRecorder::ChecksumComms  = 1; | ||||||
|  |   FlightRecorder::ChecksumCommsSend=0; | ||||||
|  |  | ||||||
|  |   if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s); | ||||||
|  |   if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms      = atoi(s); | ||||||
|  |   if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend  = atoi(s); | ||||||
|  |  | ||||||
|   int iter=0; |   int iter=0; | ||||||
|   do { |   do { | ||||||
|     if ( iter == 0 ) { |     if ( iter == 0 ) { | ||||||
|       SetGridNormLoggingMode(GridNormLoggingModeRecord); |       FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord); | ||||||
|     } else { |     } else { | ||||||
|       SetGridNormLoggingMode(GridNormLoggingModeVerify); |       FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify); | ||||||
|     } |     } | ||||||
|     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; |     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; | ||||||
|     result_o = Zero(); |     result_o = Zero(); | ||||||
| @@ -162,31 +170,22 @@ int main (int argc, char ** argv) | |||||||
|     flops+= CGsiteflops*FrbGrid->gSites()*iters; |     flops+= CGsiteflops*FrbGrid->gSites()*iters; | ||||||
|     std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl; |     std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl; | ||||||
|     std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl; |     std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl; | ||||||
|  |     std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; | ||||||
|  |  | ||||||
|     csum = crc(result_o); |     assert(FlightRecorder::ErrorCount()==0); | ||||||
|  |  | ||||||
|     if ( csumref == 0 ) { |     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||||
|       csumref = csum; |  | ||||||
|     } else { |  | ||||||
|       if ( csum != csumref ) {  |  | ||||||
| 	std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl; |  | ||||||
| 	assert(0); |  | ||||||
|       } else { |  | ||||||
| 	std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     iter ++; |     iter ++; | ||||||
|   } while (time(NULL) < (start + nsecs/2) ); |   } while (time(NULL) < (start + nsecs/10) ); | ||||||
|      |      | ||||||
|   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; |   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; | ||||||
|   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); |   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); | ||||||
|   csumref=0; |  | ||||||
|   int i=0; |   int i=0; | ||||||
|   do {  |   do {  | ||||||
|     if ( i == 0 ) { |     if ( i == 0 ) { | ||||||
|       SetGridNormLoggingMode(GridNormLoggingModeRecord); |       FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord); | ||||||
|     } else { |     } else { | ||||||
|       SetGridNormLoggingMode(GridNormLoggingModeVerify); |       FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify); | ||||||
|     } |     } | ||||||
|     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; |     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; | ||||||
|     result_o_2 = Zero(); |     result_o_2 = Zero(); | ||||||
| @@ -199,19 +198,9 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl; |     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl; | ||||||
|     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl; |     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl; | ||||||
|  |     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; | ||||||
|     csum = crc(result_o); |     assert(FlightRecorder::ErrorCount()==0); | ||||||
|  |     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||||
|     if ( csumref == 0 ) { |  | ||||||
|       csumref = csum; |  | ||||||
|     } else { |  | ||||||
|       if ( csum != csumref ) {  |  | ||||||
| 	std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl; |  | ||||||
| 	assert(0); |  | ||||||
|       } else { |  | ||||||
| 	std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     i++; |     i++; | ||||||
|   } while (time(NULL) < (start + nsecs) ); |   } while (time(NULL) < (start + nsecs) ); | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user