mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-28 18:49:33 +00:00 
			
		
		
		
	Compare commits
	
		
			35 Commits
		
	
	
		
			fix/HOST_N
			...
			8a098889fc
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 8a098889fc | ||
|  | ff2ea5de18 | ||
|  | da59379612 | ||
|  | 3ef2a41518 | ||
|  | aa96f420c6 | ||
|  | 49e9e4ed0e | ||
|  | f7b8163016 | ||
|  | 93769eacd3 | ||
|  | 59b0cc11df | ||
|  | f32c275376 | ||
|  | 5404fc66ab | ||
|  | 1f53458af8 | ||
|  | 434c3e7f1d | ||
|  | 500b119f3d | ||
|  | 4b87259c1b | ||
|  | 503dec34ef | ||
|  | d1e9fe50d2 | ||
|  | d01e5fa838 | ||
|  | a477c25e8c | ||
|  | 1bd20cd9e8 | ||
|  | e49e95b037 | ||
|  | 6f59fed563 | ||
|  | 60b7f6c99d | ||
|  | b92dfcc8d3 | ||
|  | f6fd6dd053 | ||
|  | 79ad567dd5 | ||
|  | fab1efb48c | ||
|  | 660eb76d93 | ||
|  | 62e7bf024a | ||
|  | 95f3d69cf9 | ||
| 89c0519f83 | |||
| 2704b82084 | |||
| cf8632bbac | |||
| d224297972 | |||
|  | a4d11a630f | 
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #define _GRID_FFT_H_ | #define _GRID_FFT_H_ | ||||||
|  |  | ||||||
| #ifdef HAVE_FFTW | #ifdef HAVE_FFTW | ||||||
| #ifdef USE_MKL | #if defined(USE_MKL) || defined(GRID_SYCL) | ||||||
| #include <fftw/fftw3.h> | #include <fftw/fftw3.h> | ||||||
| #else | #else | ||||||
| #include <fftw3.h> | #include <fftw3.h> | ||||||
|   | |||||||
| @@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | |||||||
|   return offbytes; |   return offbytes; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #undef NVLINK_GET // Define to use get instead of put DMA | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int dest,int dox, | 							 int dest,int dox, | ||||||
| @@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|       list.push_back(rrq); |       list.push_back(rrq); | ||||||
|       off_node_bytes+=rbytes; |       off_node_bytes+=rbytes; | ||||||
|     } |     } | ||||||
|  | #ifdef NVLINK_GET | ||||||
|  |       void *shm = (void *) this->ShmBufferTranslate(from,xmit); | ||||||
|  |       assert(shm!=NULL); | ||||||
|  |       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); | ||||||
|  | #endif | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   if (dox) { |   if (dox) { | ||||||
|  |     //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes); | ||||||
|     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { |     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { | ||||||
|       tag= dir+_processor*32; |       tag= dir+_processor*32; | ||||||
|       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); |       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); | ||||||
| @@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|       list.push_back(xrq); |       list.push_back(xrq); | ||||||
|       off_node_bytes+=xbytes; |       off_node_bytes+=xbytes; | ||||||
|     } else { |     } else { | ||||||
|  | #ifndef NVLINK_GET | ||||||
|       void *shm = (void *) this->ShmBufferTranslate(dest,recv); |       void *shm = (void *) this->ShmBufferTranslate(dest,recv); | ||||||
|       assert(shm!=NULL); |       assert(shm!=NULL); | ||||||
|       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); |       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); | ||||||
|  | #endif | ||||||
|  |        | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque | |||||||
| { | { | ||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|  |  | ||||||
|  |   acceleratorCopySynchronise(); | ||||||
|  |  | ||||||
|   if (nreq==0) return; |   if (nreq==0) return; | ||||||
|  |  | ||||||
|   std::vector<MPI_Status> status(nreq); |   std::vector<MPI_Status> status(nreq); | ||||||
|   | |||||||
| @@ -40,6 +40,9 @@ int                 GlobalSharedMemory::_ShmAlloc; | |||||||
| uint64_t            GlobalSharedMemory::_ShmAllocBytes; | uint64_t            GlobalSharedMemory::_ShmAllocBytes; | ||||||
|  |  | ||||||
| std::vector<void *> GlobalSharedMemory::WorldShmCommBufs; | std::vector<void *> GlobalSharedMemory::WorldShmCommBufs; | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  | void * GlobalSharedMemory::HostCommBuf; | ||||||
|  | #endif | ||||||
|  |  | ||||||
| Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm; | Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm; | ||||||
| int                 GlobalSharedMemory::WorldShmRank; | int                 GlobalSharedMemory::WorldShmRank; | ||||||
| @@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void) | |||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
| // Alloc, free shmem region | // Alloc, free shmem region | ||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  | void *SharedMemory::HostBufferMalloc(size_t bytes){ | ||||||
|  |   void *ptr = (void *)host_heap_top; | ||||||
|  |   host_heap_top  += bytes; | ||||||
|  |   host_heap_bytes+= bytes; | ||||||
|  |   if (host_heap_bytes >= host_heap_size) { | ||||||
|  |     std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl; | ||||||
|  |     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl; | ||||||
|  |     std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; | ||||||
|  |     std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl; | ||||||
|  |     std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl; | ||||||
|  |     assert(host_heap_bytes<host_heap_size); | ||||||
|  |   } | ||||||
|  |   return ptr; | ||||||
|  | } | ||||||
|  | void SharedMemory::HostBufferFreeAll(void) {  | ||||||
|  |   host_heap_top  =(size_t)HostCommBuf; | ||||||
|  |   host_heap_bytes=0; | ||||||
|  | } | ||||||
|  | #endif | ||||||
| void *SharedMemory::ShmBufferMalloc(size_t bytes){ | void *SharedMemory::ShmBufferMalloc(size_t bytes){ | ||||||
|   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes |   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes | ||||||
|   void *ptr = (void *)heap_top; |   void *ptr = (void *)heap_top; | ||||||
|   | |||||||
| @@ -75,7 +75,9 @@ public: | |||||||
|   static int           Hugepages; |   static int           Hugepages; | ||||||
|  |  | ||||||
|   static std::vector<void *> WorldShmCommBufs; |   static std::vector<void *> WorldShmCommBufs; | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   static void *HostCommBuf; | ||||||
|  | #endif | ||||||
|   static Grid_MPI_Comm WorldComm; |   static Grid_MPI_Comm WorldComm; | ||||||
|   static int           WorldRank; |   static int           WorldRank; | ||||||
|   static int           WorldSize; |   static int           WorldSize; | ||||||
| @@ -120,6 +122,13 @@ private: | |||||||
|   size_t heap_bytes; |   size_t heap_bytes; | ||||||
|   size_t heap_size; |   size_t heap_size; | ||||||
|  |  | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   size_t host_heap_top;  // set in free all | ||||||
|  |   size_t host_heap_bytes;// set in free all | ||||||
|  |   void *HostCommBuf;     // set in SetCommunicator | ||||||
|  |   size_t host_heap_size; // set in SetCommunicator | ||||||
|  | #endif | ||||||
|  |    | ||||||
| protected: | protected: | ||||||
|  |  | ||||||
|   Grid_MPI_Comm    ShmComm; // for barriers |   Grid_MPI_Comm    ShmComm; // for barriers | ||||||
| @@ -151,7 +160,10 @@ public: | |||||||
|   void *ShmBufferTranslate(int rank,void * local_p); |   void *ShmBufferTranslate(int rank,void * local_p); | ||||||
|   void *ShmBufferMalloc(size_t bytes); |   void *ShmBufferMalloc(size_t bytes); | ||||||
|   void  ShmBufferFreeAll(void) ; |   void  ShmBufferFreeAll(void) ; | ||||||
|    | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   void *HostBufferMalloc(size_t bytes); | ||||||
|  |   void HostBufferFreeAll(void); | ||||||
|  | #endif   | ||||||
|   ////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////// | ||||||
|   // Make info on Nodes & ranks and Shared memory available |   // Make info on Nodes & ranks and Shared memory available | ||||||
|   ////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -39,9 +39,11 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| #include <hip/hip_runtime_api.h> | #include <hip/hip_runtime_api.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
| #define GRID_SYCL_LEVEL_ZERO_IPC | #define GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|  | #define SHM_SOCKETS | ||||||
|  | #endif  | ||||||
| #include <syscall.h> | #include <syscall.h> | ||||||
| #define SHM_SOCKETS  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #include <sys/socket.h> | #include <sys/socket.h> | ||||||
| @@ -512,46 +514,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| // Hugetlbfs mapping intended | // Hugetlbfs mapping intended | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL) | #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL) | ||||||
|  |  | ||||||
| //if defined(GRID_SYCL) |  | ||||||
| #if 0 |  | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) |  | ||||||
| { |  | ||||||
|   void * ShmCommBuf ;  |  | ||||||
|   assert(_ShmSetup==1); |  | ||||||
|   assert(_ShmAlloc==0); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // allocate the pointer array for shared windows for our group |  | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   MPI_Barrier(WorldShmComm); |  | ||||||
|   WorldShmCommBufs.resize(WorldShmSize); |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Each MPI rank should allocate our own buffer |  | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   ShmCommBuf = acceleratorAllocDevice(bytes); |  | ||||||
|  |  | ||||||
|   if (ShmCommBuf == (void *)NULL ) { |  | ||||||
|     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; |  | ||||||
|     exit(EXIT_FAILURE);   |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  |  | ||||||
| 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; |  | ||||||
|  |  | ||||||
|   SharedMemoryZero(ShmCommBuf,bytes); |  | ||||||
|  |  | ||||||
|   assert(WorldShmSize == 1); |  | ||||||
|   for(int r=0;r<WorldShmSize;r++){ |  | ||||||
|     WorldShmCommBufs[r] = ShmCommBuf; |  | ||||||
|   } |  | ||||||
|   _ShmAllocBytes=bytes; |  | ||||||
|   _ShmAlloc=1; |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)   |  | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| { | { | ||||||
|   void * ShmCommBuf ;  |   void * ShmCommBuf ;  | ||||||
| @@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Each MPI rank should allocate our own buffer |   // Each MPI rank should allocate our own buffer | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   HostCommBuf= malloc(bytes); | ||||||
|  | #endif   | ||||||
|   ShmCommBuf = acceleratorAllocDevice(bytes); |   ShmCommBuf = acceleratorAllocDevice(bytes); | ||||||
|   if (ShmCommBuf == (void *)NULL ) { |   if (ShmCommBuf == (void *)NULL ) { | ||||||
|     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; |     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; | ||||||
| @@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   _ShmAllocBytes=bytes; |   _ShmAllocBytes=bytes; | ||||||
|   _ShmAlloc=1; |   _ShmAlloc=1; | ||||||
| } | } | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #else  | #else  | ||||||
| #ifdef GRID_MPI3_SHMMMAP | #ifdef GRID_MPI3_SHMMMAP | ||||||
| @@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) | |||||||
|   } |   } | ||||||
|   ShmBufferFreeAll(); |   ShmBufferFreeAll(); | ||||||
|  |  | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   host_heap_size = heap_size; | ||||||
|  |   HostCommBuf= GlobalSharedMemory::HostCommBuf; | ||||||
|  |   HostBufferFreeAll(); | ||||||
|  | #endif   | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////// | ||||||
|   // find comm ranks in our SHM group (i.e. which ranks are on our node) |   // find comm ranks in our SHM group (i.e. which ranks are on our node) | ||||||
|   ///////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -281,12 +281,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | ||||||
|   GridBase *grid = left.Grid(); |   GridBase *grid = left.Grid(); | ||||||
|  |  | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  |   uint64_t csum=0; | ||||||
|  |   if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) | ||||||
|  |   { | ||||||
|  |     // Hack | ||||||
|  |     // Fast integer xor checksum. Can also be used in comms now. | ||||||
|  |     autoView(l_v,left,AcceleratorRead); | ||||||
|  |     Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); | ||||||
|  |     uint64_t *base= (uint64_t *)&l_v[0]; | ||||||
|  |     csum=svm_xor(base,words); | ||||||
|  |   } | ||||||
|  |   FlightRecorder::CsumLog(csum); | ||||||
|  | #endif | ||||||
|   ComplexD nrm = rankInnerProduct(left,right); |   ComplexD nrm = rankInnerProduct(left,right); | ||||||
|   //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl; |   RealD local = real(nrm); | ||||||
|  |   FlightRecorder::NormLog(real(nrm));  | ||||||
|   grid->GlobalSum(nrm); |   grid->GlobalSum(nrm); | ||||||
|  |   FlightRecorder::ReductionLog(local,real(nrm));  | ||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite | |||||||
|   return result; |   return result; | ||||||
| } | } | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); |  | ||||||
|  |  | ||||||
| /* | template<class Word> Word svm_xor(Word *vec,uint64_t L) | ||||||
| template<class Double> Double svm_reduce(Double *vec,uint64_t L) |  | ||||||
| { | { | ||||||
|   Double sumResult; zeroit(sumResult); |   Word xorResult; xorResult = 0; | ||||||
|   Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator); |   Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); | ||||||
|   Double identity;  zeroit(identity); |   Word identity;  identity=0; | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|      auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>()); |      auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, |      cgh.parallel_for(cl::sycl::range<1>{L}, | ||||||
| 		      Reduction, | 		      Reduction, | ||||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { | 		      [=] (cl::sycl::id<1> index, auto &sum) { | ||||||
| 	 sum +=vec[index]; | 	 sum ^=vec[index]; | ||||||
|      }); |      }); | ||||||
|    }); |    }); | ||||||
|   theGridAccelerator->wait(); |   theGridAccelerator->wait(); | ||||||
|   Double ret = d_sum[0]; |   Word ret = d_sum[0]; | ||||||
|   free(d_sum,*theGridAccelerator); |   free(d_sum,*theGridAccelerator); | ||||||
|   std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl; |  | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  | ||||||
| template <class vobj> | template <class vobj> | ||||||
| inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) | inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) | ||||||
| { | { | ||||||
|   | |||||||
| @@ -411,7 +411,7 @@ public: | |||||||
|       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; |       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; | ||||||
|       SeedFixedIntegers(seeds); |       SeedFixedIntegers(seeds); | ||||||
|     } |     } | ||||||
|   void SeedFixedIntegers(const std::vector<int> &seeds){ |   void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){ | ||||||
|  |  | ||||||
|     // Everyone generates the same seed_seq based on input seeds |     // Everyone generates the same seed_seq based on input seeds | ||||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); |     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||||
| @@ -428,7 +428,6 @@ public: | |||||||
|     // MT implementation does not implement fast discard even though |     // MT implementation does not implement fast discard even though | ||||||
|     // in principle this is possible |     // in principle this is possible | ||||||
|     //////////////////////////////////////////////// |     //////////////////////////////////////////////// | ||||||
| #if 1 |  | ||||||
|     thread_for( lidx, _grid->lSites(), { |     thread_for( lidx, _grid->lSites(), { | ||||||
|  |  | ||||||
| 	int gidx; | 	int gidx; | ||||||
| @@ -449,29 +448,12 @@ public: | |||||||
| 	 | 	 | ||||||
| 	int l_idx=generator_idx(o_idx,i_idx); | 	int l_idx=generator_idx(o_idx,i_idx); | ||||||
| 	_generators[l_idx] = master_engine; | 	_generators[l_idx] = master_engine; | ||||||
| 	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | 	if ( britney ) {  | ||||||
|     }); | 	  Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence | ||||||
| #else | 	} else { 	 | ||||||
|     // Everybody loops over global volume. |  | ||||||
|     thread_for( gidx, _grid->_gsites, { |  | ||||||
|  |  | ||||||
| 	// Where is it? |  | ||||||
| 	int rank; |  | ||||||
| 	int o_idx; |  | ||||||
| 	int i_idx; |  | ||||||
|  |  | ||||||
| 	Coordinate gcoor; |  | ||||||
| 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor); |  | ||||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); |  | ||||||
| 	 |  | ||||||
| 	// If this is one of mine we take it |  | ||||||
| 	if( rank == _grid->ThisRank() ){ |  | ||||||
| 	  int l_idx=generator_idx(o_idx,i_idx); |  | ||||||
| 	  _generators[l_idx] = master_engine; |  | ||||||
| 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | ||||||
| 	} | 	} | ||||||
|     }); |     }); | ||||||
| #endif |  | ||||||
| #else  | #else  | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|     // Machine and thread decomposition dependent seeding is efficient |     // Machine and thread decomposition dependent seeding is efficient | ||||||
|   | |||||||
| @@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|     autoView(st_v , st,AcceleratorRead); |     autoView(st_v , st,AcceleratorRead); | ||||||
|  |  | ||||||
|    if( interior && exterior ) { |    if( interior && exterior ) { | ||||||
|  |      acceleratorFenceComputeStream(); | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
| @@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|     autoView(st_v ,st,AcceleratorRead); |     autoView(st_v ,st,AcceleratorRead); | ||||||
|  |  | ||||||
|    if( interior && exterior ) { |    if( interior && exterior ) { | ||||||
|  |      acceleratorFenceComputeStream(); | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|   | |||||||
| @@ -70,57 +70,6 @@ struct DefaultImplParams { | |||||||
| void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, | void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, | ||||||
| 				 int off,std::vector<std::pair<int,int> > & table); | 				 int off,std::vector<std::pair<int,int> > & table); | ||||||
|  |  | ||||||
| /* |  | ||||||
| template<class vobj,class cobj,class compressor> |  | ||||||
| void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline)); |  | ||||||
|  |  | ||||||
| template<class vobj,class cobj,class compressor> |  | ||||||
| void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) |  | ||||||
| { |  | ||||||
|   int num=table.size(); |  | ||||||
|   std::pair<int,int> *table_v = & table[0]; |  | ||||||
|  |  | ||||||
|   auto rhs_v = rhs.View(AcceleratorRead); |  | ||||||
|   accelerator_forNB( i,num, vobj::Nsimd(), { |  | ||||||
|     compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); |  | ||||||
|   }); |  | ||||||
|   rhs_v.ViewClose(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////////// |  | ||||||
| // Gather for when there *is* need to SIMD split with compression |  | ||||||
| /////////////////////////////////////////////////////////////////// |  | ||||||
| template<class cobj,class vobj,class compressor> |  | ||||||
| void Gather_plane_exchange_table(const Lattice<vobj> &rhs, |  | ||||||
| 				 commVector<cobj *> pointers, |  | ||||||
| 				 int dimension,int plane, |  | ||||||
| 				 int cbmask,compressor &compress,int type) __attribute__((noinline)); |  | ||||||
|  |  | ||||||
| template<class cobj,class vobj,class compressor> |  | ||||||
| void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table, |  | ||||||
| 				 const Lattice<vobj> &rhs, |  | ||||||
| 				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask, |  | ||||||
| 				 compressor &compress,int type) |  | ||||||
| { |  | ||||||
|   assert( (table.size()&0x1)==0); |  | ||||||
|   int num=table.size()/2; |  | ||||||
|   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane |  | ||||||
|  |  | ||||||
|   auto rhs_v = rhs.View(AcceleratorRead); |  | ||||||
|   auto rhs_p = &rhs_v[0]; |  | ||||||
|   auto p0=&pointers[0][0]; |  | ||||||
|   auto p1=&pointers[1][0]; |  | ||||||
|   auto tp=&table[0]; |  | ||||||
|   accelerator_forNB(j, num, vobj::Nsimd(), { |  | ||||||
|       compress.CompressExchange(p0,p1, rhs_p, j, |  | ||||||
| 				so+tp[2*j  ].second, |  | ||||||
| 				so+tp[2*j+1].second, |  | ||||||
| 				type); |  | ||||||
|   }); |  | ||||||
|   rhs_v.ViewClose(); |  | ||||||
| } |  | ||||||
| */ |  | ||||||
|  |  | ||||||
| void DslashResetCounts(void); | void DslashResetCounts(void); | ||||||
| void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); | void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); | ||||||
| void DslashLogFull(void); | void DslashLogFull(void); | ||||||
| @@ -258,6 +207,10 @@ public: | |||||||
|   struct Packet { |   struct Packet { | ||||||
|     void * send_buf; |     void * send_buf; | ||||||
|     void * recv_buf; |     void * recv_buf; | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |     void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE | ||||||
|  |     void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE | ||||||
|  | #endif | ||||||
|     Integer to_rank; |     Integer to_rank; | ||||||
|     Integer from_rank; |     Integer from_rank; | ||||||
|     Integer do_send; |     Integer do_send; | ||||||
| @@ -324,7 +277,7 @@ public: | |||||||
|   Vector<int> surface_list; |   Vector<int> surface_list; | ||||||
|  |  | ||||||
|   stencilVector<StencilEntry>  _entries; // Resident in managed memory |   stencilVector<StencilEntry>  _entries; // Resident in managed memory | ||||||
|   commVector<StencilEntry>     _entries_device; // Resident in managed memory |   commVector<StencilEntry>     _entries_device; // Resident in device memory | ||||||
|   std::vector<Packet> Packets; |   std::vector<Packet> Packets; | ||||||
|   std::vector<Merge> Mergers; |   std::vector<Merge> Mergers; | ||||||
|   std::vector<Merge> MergersSHM; |   std::vector<Merge> MergersSHM; | ||||||
| @@ -408,33 +361,16 @@ public: | |||||||
|   // Use OpenMP Tasks for cleaner ??? |   // Use OpenMP Tasks for cleaner ??? | ||||||
|   // must be called *inside* parallel region |   // must be called *inside* parallel region | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   /* |  | ||||||
|   void CommunicateThreaded() |  | ||||||
|   { |  | ||||||
| #ifdef GRID_OMP |  | ||||||
|     int mythread = omp_get_thread_num(); |  | ||||||
|     int nthreads = CartesianCommunicator::nCommThreads; |  | ||||||
| #else |  | ||||||
|     int mythread = 0; |  | ||||||
|     int nthreads = 1; |  | ||||||
| #endif |  | ||||||
|     if (nthreads == -1) nthreads = 1; |  | ||||||
|     if (mythread < nthreads) { |  | ||||||
|       for (int i = mythread; i < Packets.size(); i += nthreads) { |  | ||||||
| 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, |  | ||||||
| 						      Packets[i].to_rank, |  | ||||||
| 						      Packets[i].recv_buf, |  | ||||||
| 						      Packets[i].from_rank, |  | ||||||
| 						      Packets[i].bytes,i); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   */ |  | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Non blocking send and receive. Necessarily parallel. |   // Non blocking send and receive. Necessarily parallel. | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|  |     // All GPU kernel tasks must complete | ||||||
|  |     //    accelerator_barrier();     // All kernels should ALREADY be complete | ||||||
|  |     //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer | ||||||
|  |                                // But the HaloGather had a barrier too. | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       _grid->StencilSendToRecvFromBegin(MpiReqs, |       _grid->StencilSendToRecvFromBegin(MpiReqs, | ||||||
| 					Packets[i].send_buf, | 					Packets[i].send_buf, | ||||||
| @@ -443,16 +379,54 @@ public: | |||||||
| 					Packets[i].from_rank,Packets[i].do_recv, | 					Packets[i].from_rank,Packets[i].do_recv, | ||||||
| 					Packets[i].xbytes,Packets[i].rbytes,i); | 					Packets[i].xbytes,Packets[i].rbytes,i); | ||||||
|     } |     } | ||||||
|  | #else | ||||||
|  | #warning "Using COPY VIA HOST BUFFERS IN STENCIL" | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       // Introduce a host buffer with a cheap slab allocator and zero cost wipe all | ||||||
|  |       Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes); | ||||||
|  |       Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); | ||||||
|  |       if ( Packets[i].do_send ) { | ||||||
|  | 	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); | ||||||
|  |       } | ||||||
|  |       _grid->StencilSendToRecvFromBegin(MpiReqs, | ||||||
|  | 					Packets[i].host_send_buf, | ||||||
|  | 					Packets[i].to_rank,Packets[i].do_send, | ||||||
|  | 					Packets[i].host_recv_buf, | ||||||
|  | 					Packets[i].from_rank,Packets[i].do_recv, | ||||||
|  | 					Packets[i].xbytes,Packets[i].rbytes,i); | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |     // Get comms started then run checksums | ||||||
|  |     // Having this PRIOR to the dslash seems to make Sunspot work... (!) | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       if ( Packets[i].do_send ) | ||||||
|  | 	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     _grid->StencilSendToRecvFromComplete(MpiReqs,0); |     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done | ||||||
|     if   ( this->partialDirichlet ) DslashLogPartial(); |     if   ( this->partialDirichlet ) DslashLogPartial(); | ||||||
|     else if ( this->fullDirichlet ) DslashLogDirichlet(); |     else if ( this->fullDirichlet ) DslashLogDirichlet(); | ||||||
|     else DslashLogFull(); |     else DslashLogFull(); | ||||||
|     acceleratorCopySynchronise(); |     // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete | ||||||
|  |     //    accelerator_barrier();  | ||||||
|     _grid->StencilBarrier();  |     _grid->StencilBarrier();  | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  | #warning "Using COPY VIA HOST BUFFERS IN STENCIL" | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       if ( Packets[i].do_recv ) { | ||||||
|  | 	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     _grid->HostBufferFreeAll(); | ||||||
|  | #endif | ||||||
|  |     // run any checksums | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       if ( Packets[i].do_recv ) | ||||||
|  | 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Blocking send and receive. Either sequential or parallel. |   // Blocking send and receive. Either sequential or parallel. | ||||||
| @@ -528,6 +502,7 @@ public: | |||||||
|   template<class compressor> |   template<class compressor> | ||||||
|   void HaloGather(const Lattice<vobj> &source,compressor &compress) |   void HaloGather(const Lattice<vobj> &source,compressor &compress) | ||||||
|   { |   { | ||||||
|  |     //    accelerator_barrier(); | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes |     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||||
|  |  | ||||||
|     assert(source.Grid()==_grid); |     assert(source.Grid()==_grid); | ||||||
| @@ -540,10 +515,9 @@ public: | |||||||
|       compress.Point(point); |       compress.Point(point); | ||||||
|       HaloGatherDir(source,compress,point,face_idx); |       HaloGatherDir(source,compress,point,face_idx); | ||||||
|     } |     } | ||||||
|     accelerator_barrier(); |     accelerator_barrier(); // All my local gathers are complete | ||||||
|     face_table_computed=1; |     face_table_computed=1; | ||||||
|     assert(u_comm_offset==_unified_buffer_size); |     assert(u_comm_offset==_unified_buffer_size); | ||||||
|  |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ///////////////////////// |   ///////////////////////// | ||||||
| @@ -579,6 +553,7 @@ public: | |||||||
|       accelerator_forNB(j, words, cobj::Nsimd(), { |       accelerator_forNB(j, words, cobj::Nsimd(), { | ||||||
| 	  coalescedWrite(to[j] ,coalescedRead(from [j])); | 	  coalescedWrite(to[j] ,coalescedRead(from [j])); | ||||||
|       }); |       }); | ||||||
|  |       acceleratorFenceComputeStream(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|    |    | ||||||
| @@ -669,6 +644,7 @@ public: | |||||||
|     for(int i=0;i<dd.size();i++){ |     for(int i=0;i<dd.size();i++){ | ||||||
|       decompressor::DecompressFace(decompress,dd[i]); |       decompressor::DecompressFace(decompress,dd[i]); | ||||||
|     } |     } | ||||||
|  |     acceleratorFenceComputeStream(); // dependent kernels | ||||||
|   } |   } | ||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   // Set up routines |   // Set up routines | ||||||
| @@ -1224,7 +1200,6 @@ public: | |||||||
| 	  /////////////////////////////////////////////////////////// | 	  /////////////////////////////////////////////////////////// | ||||||
| 	  int do_send = (comms_send|comms_partial_send) && (!shm_send ); | 	  int do_send = (comms_send|comms_partial_send) && (!shm_send ); | ||||||
| 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv ); | 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv ); | ||||||
| 	   |  | ||||||
| 	  AddPacket((void *)&send_buf[comm_off], | 	  AddPacket((void *)&send_buf[comm_off], | ||||||
| 		    (void *)&recv_buf[comm_off], | 		    (void *)&recv_buf[comm_off], | ||||||
| 		    xmit_to_rank, do_send, | 		    xmit_to_rank, do_send, | ||||||
|   | |||||||
| @@ -405,11 +405,4 @@ NAMESPACE_BEGIN(Grid); | |||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
| template<typename T> struct |  | ||||||
| sycl::is_device_copyable<T, typename std::enable_if< |  | ||||||
| 			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value), |  | ||||||
| 			      void>::type> |  | ||||||
|   : public std::true_type {}; |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										341
									
								
								Grid/util/FlightRecorder.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										341
									
								
								Grid/util/FlightRecorder.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,341 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
|  |     Source file: ./lib/Init.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <peterboyle@MacBook-Pro.local> | ||||||
|  | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  | /////////////////////////////////////////////////////// | ||||||
|  | // Grid Norm logging for repro testing | ||||||
|  | /////////////////////////////////////////////////////// | ||||||
|  | int FlightRecorder::PrintEntireLog; | ||||||
|  | int FlightRecorder::ContinueOnFail; | ||||||
|  | int FlightRecorder::LoggingMode; | ||||||
|  | int FlightRecorder::ChecksumComms; | ||||||
|  | int FlightRecorder::ChecksumCommsSend; | ||||||
|  | int32_t  FlightRecorder::XmitLoggingCounter; | ||||||
|  | int32_t  FlightRecorder::RecvLoggingCounter; | ||||||
|  | int32_t  FlightRecorder::CsumLoggingCounter; | ||||||
|  | int32_t  FlightRecorder::NormLoggingCounter; | ||||||
|  | int32_t  FlightRecorder::ReductionLoggingCounter; | ||||||
|  | uint64_t FlightRecorder::ErrorCounter; | ||||||
|  | std::vector<double> FlightRecorder::NormLogVector; | ||||||
|  | std::vector<double> FlightRecorder::ReductionLogVector; | ||||||
|  | std::vector<uint64_t> FlightRecorder::CsumLogVector; | ||||||
|  | std::vector<uint64_t> FlightRecorder::XmitLogVector; | ||||||
|  | std::vector<uint64_t> FlightRecorder::RecvLogVector; | ||||||
|  |  | ||||||
|  | void FlightRecorder::ResetCounters(void) | ||||||
|  | { | ||||||
|  |   XmitLoggingCounter=0; | ||||||
|  |   RecvLoggingCounter=0; | ||||||
|  |   CsumLoggingCounter=0; | ||||||
|  |   NormLoggingCounter=0; | ||||||
|  |   ReductionLoggingCounter=0; | ||||||
|  | } | ||||||
|  | void FlightRecorder::Truncate(void) | ||||||
|  | { | ||||||
|  |   ResetCounters(); | ||||||
|  |   XmitLogVector.resize(0); | ||||||
|  |   RecvLogVector.resize(0); | ||||||
|  |   NormLogVector.resize(0); | ||||||
|  |   CsumLogVector.resize(0); | ||||||
|  |   ReductionLogVector.resize(0); | ||||||
|  | } | ||||||
|  | void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode) | ||||||
|  | { | ||||||
|  |   switch ( mode ) { | ||||||
|  |   case LoggingModePrint: | ||||||
|  |     SetLoggingModePrint(); | ||||||
|  |     break; | ||||||
|  |   case LoggingModeRecord: | ||||||
|  |     SetLoggingModeRecord(); | ||||||
|  |     break; | ||||||
|  |   case LoggingModeVerify: | ||||||
|  |     SetLoggingModeVerify(); | ||||||
|  |     break; | ||||||
|  |   case LoggingModeNone: | ||||||
|  |     LoggingMode = mode; | ||||||
|  |     Truncate(); | ||||||
|  |     break; | ||||||
|  |   default: | ||||||
|  |     assert(0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void FlightRecorder::SetLoggingModePrint(void) | ||||||
|  | { | ||||||
|  |   std::cout << " FlightRecorder: set to print output " <<std::endl; | ||||||
|  |   Truncate(); | ||||||
|  |   LoggingMode = LoggingModePrint; | ||||||
|  | } | ||||||
|  | void FlightRecorder::SetLoggingModeRecord(void) | ||||||
|  | { | ||||||
|  |   std::cout << " FlightRecorder: set to RECORD " <<std::endl; | ||||||
|  |   Truncate(); | ||||||
|  |   LoggingMode = LoggingModeRecord; | ||||||
|  | } | ||||||
|  | void FlightRecorder::SetLoggingModeVerify(void) | ||||||
|  | { | ||||||
|  |   std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl; | ||||||
|  |   ResetCounters(); | ||||||
|  |   LoggingMode = LoggingModeVerify; | ||||||
|  | } | ||||||
|  | uint64_t FlightRecorder::ErrorCount(void) | ||||||
|  | { | ||||||
|  |   return ErrorCounter; | ||||||
|  | } | ||||||
|  | void FlightRecorder::NormLog(double value) | ||||||
|  | { | ||||||
|  |   uint64_t hex = * ( (uint64_t *)&value ); | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|  |     NormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|  |     NormLogVector.push_back(value); | ||||||
|  |     NormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |  | ||||||
|  |     if(NormLoggingCounter < NormLogVector.size()){ | ||||||
|  |       uint64_t hexref  = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] ); | ||||||
|  |  | ||||||
|  |       if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) { | ||||||
|  |  | ||||||
|  | 	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter | ||||||
|  | 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" " | ||||||
|  | 		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl; | ||||||
|  |  | ||||||
|  | 	std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl; | ||||||
|  |  | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		NormLoggingCounter,NormLogVector.size(), | ||||||
|  | 		value, NormLogVector[NormLoggingCounter]); fflush(stderr); | ||||||
|  |  | ||||||
|  | 	if(!ContinueOnFail)assert(0); // Force takedown of job | ||||||
|  | 	   | ||||||
|  | 	ErrorCounter++; | ||||||
|  |       } else { | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex | ||||||
|  | 		   <<" "<<hex<<" "<<hexref | ||||||
|  | 		   <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |         | ||||||
|  |     } | ||||||
|  |     if ( NormLogVector.size()==NormLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl; | ||||||
|  |     } | ||||||
|  |     NormLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void FlightRecorder::CsumLog(uint64_t hex) | ||||||
|  | { | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|  |     CsumLoggingCounter++; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|  |     CsumLogVector.push_back(hex); | ||||||
|  |     CsumLoggingCounter++; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |      | ||||||
|  |     if(CsumLoggingCounter < CsumLogVector.size()) { | ||||||
|  |  | ||||||
|  |       uint64_t hexref  = CsumLogVector[CsumLoggingCounter] ; | ||||||
|  |  | ||||||
|  |       if ( hex != hexref ) { | ||||||
|  |  | ||||||
|  |         std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter | ||||||
|  | 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl; | ||||||
|  |  | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		CsumLoggingCounter,hex, hexref); | ||||||
|  | 	fflush(stderr); | ||||||
|  |  | ||||||
|  | 	if(!ContinueOnFail) assert(0); // Force takedown of job | ||||||
|  | 	   | ||||||
|  | 	ErrorCounter++; | ||||||
|  |  | ||||||
|  |       } else { | ||||||
|  |  | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex | ||||||
|  | 		   <<" "<<hex<<" "<<hexref<<std::dec<<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     }   | ||||||
|  |     if ( CsumLogVector.size()==CsumLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl; | ||||||
|  |     } | ||||||
|  |     CsumLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void FlightRecorder::ReductionLog(double local,double global) | ||||||
|  | { | ||||||
|  |   uint64_t hex_l = * ( (uint64_t *)&local ); | ||||||
|  |   uint64_t hex_g = * ( (uint64_t *)&global ); | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl; | ||||||
|  |     ReductionLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl; | ||||||
|  |     ReductionLogVector.push_back(global); | ||||||
|  |     ReductionLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |     if(ReductionLoggingCounter < ReductionLogVector.size()){ | ||||||
|  |       if ( global != ReductionLogVector[ReductionLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		ReductionLoggingCounter,ReductionLogVector.size(), | ||||||
|  | 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr); | ||||||
|  | 	 | ||||||
|  | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
|  | 	ErrorCounter++; | ||||||
|  |       } else { | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if ( ReductionLogVector.size()==ReductionLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl; | ||||||
|  |     } | ||||||
|  |     ReductionLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void FlightRecorder::xmitLog(void *buf,uint64_t bytes) | ||||||
|  | { | ||||||
|  |   if ( ChecksumCommsSend ){ | ||||||
|  |   uint64_t *ubuf = (uint64_t *)buf; | ||||||
|  |   if(LoggingMode == LoggingModeNone) return; | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  |   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
|  |     XmitLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
|  |     XmitLogVector.push_back(_xor); | ||||||
|  |     XmitLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |     if(XmitLoggingCounter < XmitLogVector.size()){ | ||||||
|  |       if ( _xor != XmitLogVector[XmitLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		XmitLoggingCounter,XmitLogVector.size(), | ||||||
|  | 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr); | ||||||
|  | 	 | ||||||
|  | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
|  | 	ErrorCounter++; | ||||||
|  |       } else { | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  XmitLogVector[XmitLoggingCounter] <<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if ( XmitLogVector.size()==XmitLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl; | ||||||
|  |     } | ||||||
|  |     XmitLoggingCounter++; | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   } else { | ||||||
|  |     uint64_t word = 1; | ||||||
|  |     deviceVector<uint64_t> dev(1); | ||||||
|  |     acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t)); | ||||||
|  |     acceleratorCopySynchronise(); | ||||||
|  | #ifndef GRID_COMMS_NONE | ||||||
|  |     MPI_Barrier(MPI_COMM_WORLD); | ||||||
|  | #endif | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) | ||||||
|  | { | ||||||
|  |   if ( ChecksumComms ){ | ||||||
|  |   uint64_t *ubuf = (uint64_t *)buf; | ||||||
|  |   if(LoggingMode == LoggingModeNone) return; | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  |   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); | ||||||
|  |   if(LoggingMode == LoggingModePrint) { | ||||||
|  |     std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
|  |     RecvLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeRecord) { | ||||||
|  |     std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
|  |     RecvLogVector.push_back(_xor); | ||||||
|  |     RecvLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |     if(RecvLoggingCounter < RecvLogVector.size()){ | ||||||
|  |       if ( _xor != RecvLogVector[RecvLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n", | ||||||
|  | 		GridHostname(), | ||||||
|  | 		GlobalSharedMemory::WorldShmRank, | ||||||
|  | 		RecvLoggingCounter,RecvLogVector.size(), | ||||||
|  | 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr); | ||||||
|  | 	 | ||||||
|  | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
|  | 	ErrorCounter++; | ||||||
|  |       } else { | ||||||
|  | 	if ( PrintEntireLog ) {  | ||||||
|  | 	  std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  RecvLogVector[RecvLoggingCounter] <<std::endl; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if ( RecvLogVector.size()==RecvLoggingCounter ) { | ||||||
|  |       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<RecvLoggingCounter<<" sends "<<std::endl; | ||||||
|  |     } | ||||||
|  |     RecvLoggingCounter++; | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
							
								
								
									
										43
									
								
								Grid/util/FlightRecorder.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								Grid/util/FlightRecorder.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | |||||||
|  | #pragma once | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  | class FlightRecorder { | ||||||
|  |  public: | ||||||
|  |   enum LoggingMode_t { | ||||||
|  |     LoggingModeNone, | ||||||
|  |     LoggingModePrint, | ||||||
|  |     LoggingModeRecord, | ||||||
|  |     LoggingModeVerify | ||||||
|  |   }; | ||||||
|  |    | ||||||
|  |   static int                   LoggingMode; | ||||||
|  |   static uint64_t              ErrorCounter; | ||||||
|  |   static int32_t               XmitLoggingCounter; | ||||||
|  |   static int32_t               RecvLoggingCounter; | ||||||
|  |   static int32_t               CsumLoggingCounter; | ||||||
|  |   static int32_t               NormLoggingCounter; | ||||||
|  |   static int32_t               ReductionLoggingCounter; | ||||||
|  |   static std::vector<uint64_t> XmitLogVector; | ||||||
|  |   static std::vector<uint64_t> RecvLogVector; | ||||||
|  |   static std::vector<uint64_t> CsumLogVector; | ||||||
|  |   static std::vector<double>   NormLogVector; | ||||||
|  |   static std::vector<double>   ReductionLogVector; | ||||||
|  |   static int ContinueOnFail; | ||||||
|  |   static int PrintEntireLog; | ||||||
|  |   static int ChecksumComms; | ||||||
|  |   static int ChecksumCommsSend; | ||||||
|  |   static void SetLoggingModePrint(void); | ||||||
|  |   static void SetLoggingModeRecord(void); | ||||||
|  |   static void SetLoggingModeVerify(void); | ||||||
|  |   static void SetLoggingMode(LoggingMode_t mode); | ||||||
|  |   static void NormLog(double value); | ||||||
|  |   static void CsumLog(uint64_t csum); | ||||||
|  |   static void ReductionLog(double lcl, double glbl); | ||||||
|  |   static void Truncate(void); | ||||||
|  |   static void ResetCounters(void); | ||||||
|  |   static uint64_t ErrorCount(void); | ||||||
|  |   static void xmitLog(void *,uint64_t bytes); | ||||||
|  |   static void recvLog(void *,uint64_t bytes,int rank); | ||||||
|  | }; | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| @@ -94,7 +94,12 @@ int GridThread::_threads =1; | |||||||
| int GridThread::_hyperthreads=1; | int GridThread::_hyperthreads=1; | ||||||
| int GridThread::_cores=1; | int GridThread::_cores=1; | ||||||
|  |  | ||||||
|  | char hostname[HOST_NAME_MAX+1]; | ||||||
|  |  | ||||||
|  | char *GridHostname(void) | ||||||
|  | { | ||||||
|  |   return hostname; | ||||||
|  | } | ||||||
| const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;}; | const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;}; | ||||||
| const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;}; | const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;}; | ||||||
| const Coordinate GridDefaultSimd(int dims,int nsimd) | const Coordinate GridDefaultSimd(int dims,int nsimd) | ||||||
| @@ -397,7 +402,6 @@ void Grid_init(int *argc,char ***argv) | |||||||
|   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; |   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; | ||||||
|   std::cout << GridLogMessage << "================================================ "<<std::endl; |   std::cout << GridLogMessage << "================================================ "<<std::endl; | ||||||
|  |  | ||||||
|   char hostname[HOST_NAME_MAX+1]; |  | ||||||
|   gethostname(hostname, HOST_NAME_MAX+1); |   gethostname(hostname, HOST_NAME_MAX+1); | ||||||
|   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl; |   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid); | |||||||
| void Grid_init(int *argc,char ***argv); | void Grid_init(int *argc,char ***argv); | ||||||
| void Grid_finalize(void); | void Grid_finalize(void); | ||||||
|  |  | ||||||
|  | char * GridHostname(void); | ||||||
|  |  | ||||||
| // internal, controled with --handle | // internal, controled with --handle | ||||||
| void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); | void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); | ||||||
| void Grid_debug_handler_init(void); | void Grid_debug_handler_init(void); | ||||||
| @@ -68,5 +70,6 @@ void GridParseLayout(char **argv,int argc, | |||||||
| void printHash(void); | void printHash(void); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| #ifndef GRID_UTIL_H | #pragma once | ||||||
| #define GRID_UTIL_H |  | ||||||
| #include <Grid/util/Coordinate.h> | #include <Grid/util/Coordinate.h> | ||||||
| #include <Grid/util/Lexicographic.h> | #include <Grid/util/Lexicographic.h> | ||||||
| #include <Grid/util/Init.h> | #include <Grid/util/Init.h> | ||||||
| #endif | #include <Grid/util/FlightRecorder.h> | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										21
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								configure.ac
									
									
									
									
									
								
							| @@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in | |||||||
| esac | esac | ||||||
|  |  | ||||||
| ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons | ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons | ||||||
| AC_ARG_ENABLE([accelerator-cshift], | AC_ARG_ENABLE([accelerator-aware-mpi], | ||||||
|     [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])], |     [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], | ||||||
|     [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes]) |     [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) | ||||||
|  |  | ||||||
| AC_ARG_ENABLE([ucx-buggy], | case ${ac_ACCELERATOR_AWARE_MPI} in | ||||||
|     [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])], |  | ||||||
|     [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no]) |  | ||||||
|  |  | ||||||
| case ${ac_UCXBUGGY} in |  | ||||||
|     yes) |     yes) | ||||||
|     ac_ACC_CSHIFT=no;; |       AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) | ||||||
|     *);; |       AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; | ||||||
| esac |  | ||||||
|  |  | ||||||
| case ${ac_ACC_CSHIFT} in |  | ||||||
|     yes) |  | ||||||
|       AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);; |  | ||||||
|     *);; |     *);; | ||||||
| esac | esac | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,16 +1,16 @@ | |||||||
| TOOLS=$HOME/tools |  | ||||||
| ../../configure \ | ../../configure \ | ||||||
| 	--enable-simd=GPU \ | 	--enable-simd=GPU \ | ||||||
| 	--enable-gen-simd-width=64 \ | 	--enable-gen-simd-width=64 \ | ||||||
| 	--enable-comms=mpi-auto \ | 	--enable-comms=mpi-auto \ | ||||||
| 	--enable-accelerator-cshift \ |  | ||||||
| 	--disable-gparity \ | 	--disable-gparity \ | ||||||
| 	--disable-fermion-reps \ | 	--disable-fermion-reps \ | ||||||
| 	--enable-shm=nvlink \ | 	--enable-shm=nvlink \ | ||||||
| 	--enable-accelerator=sycl \ | 	--enable-accelerator=sycl \ | ||||||
|  | 	--enable-accelerator-aware-mpi=no\ | ||||||
| 	--enable-unified=no \ | 	--enable-unified=no \ | ||||||
| 	MPICXX=mpicxx \ | 	MPICXX=mpicxx \ | ||||||
| 	CXX=icpx \ | 	CXX=icpx \ | ||||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \ | 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ | ||||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel" | 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								systems/Aurora/sourceme-sunspot-deterministic.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								systems/Aurora/sourceme-sunspot-deterministic.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
							
								
								
									
										41
									
								
								systems/Aurora/tests/repro128.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								systems/Aurora/tests/repro128.pbs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,41 @@ | |||||||
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ||||||
|  |  | ||||||
|  | #PBS -q EarlyAppAccess | ||||||
|  | #PBS -l select=128 | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | source ../sourceme.sh | ||||||
|  |  | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | export OMP_NUM_THREADS=3 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  |  | ||||||
|  | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | ||||||
|  | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | ||||||
|  | #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | # 12 ppn, 16 nodes, 192 ranks | ||||||
|  | # 12 ppn, 128 nodes, 1536 ranks | ||||||
|  | CMD="mpiexec -np 1536 -ppn 12  -envall \ | ||||||
|  | 	     ./gpu_tile_compact.sh \ | ||||||
|  | 	     ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \ | ||||||
|  | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap " | ||||||
|  | $CMD  | ||||||
| @@ -2,26 +2,39 @@ | |||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess | #PBS -l select=16:system=sunspot,place=scatter | ||||||
| #PBS -l select=16 |  | ||||||
| #PBS -l walltime=01:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=01:00:00 | ||||||
|  | #PBS -N dwf | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread | #export OMP_PROC_BIND=spread | ||||||
| #unset OMP_PLACES | #unset OMP_PLACES | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
| source ../sourceme.sh | #source ../sourceme.sh | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | #export MPICH_COLL_SYNC=1 | ||||||
|  | #export MPICH_ENV_DISPLAY=1 | ||||||
|  | export MPICH_ | ||||||
| export OMP_NUM_THREADS=3 | export OMP_NUM_THREADS=3 | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  | #export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
| @@ -32,9 +45,17 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | |||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
| export MPICH_OFI_NIC_POLICY=GPU | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
| # 12 ppn, 16 nodes, 192 ranks | DIR=repro.$PBS_JOBID | ||||||
|  | mkdir $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
| CMD="mpiexec -np 192 -ppn 12  -envall \ | CMD="mpiexec -np 192 -ppn 12  -envall \ | ||||||
| 	     ./gpu_tile_compact.sh \ | 	     ../gpu_tile_compact.sh \ | ||||||
| 	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ | 	     ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ | ||||||
| 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative" | ||||||
|  | #--comms-overlap | ||||||
| $CMD  | $CMD  | ||||||
|  |  | ||||||
|  | grep Oops Grid.stderr.* > failures.$PBS_JOBID | ||||||
|  | rm core.* | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										82
									
								
								systems/Aurora/tests/repro1gpu.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								systems/Aurora/tests/repro1gpu.pbs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,82 @@ | |||||||
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=16:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N repro1gpu | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  |  | ||||||
|  | for g in {0..11} | ||||||
|  | do | ||||||
|  | export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) | ||||||
|  | export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) | ||||||
|  | export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) | ||||||
|  |  | ||||||
|  | export numa=${NUMA_MAP[$g]} | ||||||
|  | export gpu_id=${GPU_MAP[$g]} | ||||||
|  | export tile_id=${TILE_MAP[$g]} | ||||||
|  | export gpu=$gpu_id.$tile_id | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  | echo $gpu > gpu | ||||||
|  |  | ||||||
|  | export ZE_AFFINITY_MASK=$gpu | ||||||
|  | export ONEAPI_DEVICE_FILTER=gpu,level_zero | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \ | ||||||
|  | 	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ | ||||||
|  | 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" | ||||||
|  | echo $CMD | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  | done | ||||||
|  |  | ||||||
|  | wait | ||||||
|  |  | ||||||
							
								
								
									
										98
									
								
								systems/Aurora/tests/reproN.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								systems/Aurora/tests/reproN.pbs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,98 @@ | |||||||
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=32:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N reproN | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  |  | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 | ||||||
|  | export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
|  |  | ||||||
|  | export GRID_PRINT_ENTIRE_LOG=0 | ||||||
|  | export GRID_CHECKSUM_RECV_BUF=0 | ||||||
|  | export GRID_CHECKSUM_SEND_BUF=0 | ||||||
|  |  | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  | DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  |  | ||||||
|  | #CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | #	     ../../gpu_tile_compact.sh \ | ||||||
|  | #	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | #		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | 	     ../../gpu_tile_compact.sh \ | ||||||
|  | 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | echo $CMD > command-line | ||||||
|  | env > environment | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  |  | ||||||
|  | # Suspicious wait is allowing jobs to collide and knock out | ||||||
|  | #wait | ||||||
|  |  | ||||||
|  | sleep 6500 | ||||||
|  |  | ||||||
|  | for n in ` eval echo {1..$NN} ` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | grep Oops Grid.stderr.* > failures.$PBS_JOBID | ||||||
|  | rm core.* | ||||||
|  |  | ||||||
|  | done | ||||||
| @@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU | |||||||
| CMD="mpiexec -np 192 -ppn 12  -envall \ | CMD="mpiexec -np 192 -ppn 12  -envall \ | ||||||
| 	     ./gpu_tile_compact.sh \ | 	     ./gpu_tile_compact.sh \ | ||||||
| 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ | 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ | ||||||
| 	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" | 	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap" | ||||||
| $CMD  | $CMD  | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| TOOLS=$HOME/tools |  | ||||||
| ../../configure \ | ../../configure \ | ||||||
| 	--enable-simd=GPU \ | 	--enable-simd=GPU \ | ||||||
| 	--enable-gen-simd-width=64 \ | 	--enable-gen-simd-width=64 \ | ||||||
| @@ -11,6 +11,6 @@ TOOLS=$HOME/tools | |||||||
| 	--enable-unified=no \ | 	--enable-unified=no \ | ||||||
| 	MPICXX=mpicxx \ | 	MPICXX=mpicxx \ | ||||||
| 	CXX=icpx \ | 	CXX=icpx \ | ||||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ | 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ | ||||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" | 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								systems/Sunspot/sourceme.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								systems/Sunspot/sourceme.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
							
								
								
									
										81
									
								
								systems/Sunspot/tests/repro1gpu.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										81
									
								
								systems/Sunspot/tests/repro1gpu.pbs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,81 @@ | |||||||
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=16:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N repro1gpu | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  |  | ||||||
|  | for g in {0..11} | ||||||
|  | do | ||||||
|  | export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) | ||||||
|  | export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) | ||||||
|  | export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) | ||||||
|  |  | ||||||
|  | export numa=${NUMA_MAP[$g]} | ||||||
|  | export gpu_id=${GPU_MAP[$g]} | ||||||
|  | export tile_id=${TILE_MAP[$g]} | ||||||
|  | export gpu=$gpu_id.$tile_id | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  | echo $gpu > gpu | ||||||
|  |  | ||||||
|  | export ZE_AFFINITY_MASK=$gpu | ||||||
|  | export ONEAPI_DEVICE_FILTER=gpu,level_zero | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \ | ||||||
|  | 	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ | ||||||
|  | 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" | ||||||
|  | echo $CMD | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  | done | ||||||
|  |  | ||||||
|  | wait | ||||||
|  |  | ||||||
							
								
								
									
										97
									
								
								systems/Sunspot/tests/reproN.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								systems/Sunspot/tests/reproN.pbs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | |||||||
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=32:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N reproN | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  |  | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 | ||||||
|  | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 | ||||||
|  |  | ||||||
|  | export GRID_PRINT_ENTIRE_LOG=0 | ||||||
|  | export GRID_CHECKSUM_RECV_BUF=1 | ||||||
|  | export GRID_CHECKSUM_SEND_BUF=0 | ||||||
|  |  | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  | DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  |  | ||||||
|  | #CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | #	     ../../gpu_tile_compact.sh \ | ||||||
|  | #	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | #		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | 	     ../../gpu_tile_compact.sh \ | ||||||
|  | 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | echo $CMD > command-line | ||||||
|  | env > environment | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  |  | ||||||
|  | # Suspicious wait is allowing jobs to collide and knock out | ||||||
|  | #wait | ||||||
|  |  | ||||||
|  | sleep 6500 | ||||||
|  |  | ||||||
|  | for n in ` eval echo {1..$NN} ` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | grep Oops Grid.stderr.* > failures.$PBS_JOBID | ||||||
|  | rm core.* | ||||||
|  |  | ||||||
|  | done | ||||||
| @@ -34,6 +34,46 @@ using namespace Grid; | |||||||
| #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX | #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  | template<class Matrix,class Field> | ||||||
|  |   class SchurDiagMooeeOperatorParanoid :  public SchurOperatorBase<Field> { | ||||||
|  |  public: | ||||||
|  |     Matrix &_Mat; | ||||||
|  |     SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){}; | ||||||
|  |     virtual  void Mpc      (const Field &in, Field &out) { | ||||||
|  |       Field tmp(in.Grid()); | ||||||
|  |       tmp.Checkerboard() = !in.Checkerboard(); | ||||||
|  |       //      std::cout <<" Mpc starting"<<std::endl; | ||||||
|  |  | ||||||
|  |       RealD nn = norm2(in); // std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.Meooe(in,tmp); | ||||||
|  |       nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooeinv "<<nn<<std::endl; | ||||||
|  |       _Mat.MooeeInv(tmp,out); | ||||||
|  |       nn = norm2(out); //std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.Meooe(out,tmp); | ||||||
|  |       nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooee "<<nn<<std::endl; | ||||||
|  |       _Mat.Mooee(in,out); | ||||||
|  |       nn = norm2(out); //std::cout <<" Mpc Prior to axpy "<<nn<<std::endl; | ||||||
|  |       axpy(out,-1.0,tmp,out); | ||||||
|  |     } | ||||||
|  |     virtual void MpcDag   (const Field &in, Field &out){ | ||||||
|  |       Field tmp(in.Grid()); | ||||||
|  |       //      std::cout <<" MpcDag starting"<<std::endl; | ||||||
|  |       RealD nn = norm2(in);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.MeooeDag(in,tmp); | ||||||
|  |       _Mat.MooeeInvDag(tmp,out); | ||||||
|  |       nn = norm2(out);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.MeooeDag(out,tmp); | ||||||
|  |       nn = norm2(tmp);// std::cout <<" MpcDag Prior to Mooee "<<nn<<std::endl; | ||||||
|  |       _Mat.MooeeDag(in,out); | ||||||
|  |       nn = norm2(out);// std::cout <<" MpcDag Prior to axpy "<<nn<<std::endl; | ||||||
|  |       axpy(out,-1.0,tmp,out); | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
|   char hostname[HOST_NAME_MAX+1]; |   char hostname[HOST_NAME_MAX+1]; | ||||||
| @@ -82,8 +122,8 @@ int main (int argc, char ** argv) | |||||||
|   result_o_2.Checkerboard() = Odd; |   result_o_2.Checkerboard() = Odd; | ||||||
|   result_o_2 = Zero(); |   result_o_2 = Zero(); | ||||||
|  |  | ||||||
|   SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf); |   SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf); | ||||||
|   SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f); |   SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f); | ||||||
|  |  | ||||||
|   int nsecs=600; |   int nsecs=600; | ||||||
|   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ |   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ | ||||||
| @@ -104,10 +144,22 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   time_t start = time(NULL); |   time_t start = time(NULL); | ||||||
|  |  | ||||||
|   uint32_t csum, csumref; |   FlightRecorder::ContinueOnFail = 0; | ||||||
|   csumref=0; |   FlightRecorder::PrintEntireLog = 0; | ||||||
|  |   FlightRecorder::ChecksumComms  = 1; | ||||||
|  |   FlightRecorder::ChecksumCommsSend=0; | ||||||
|  |  | ||||||
|  |   if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s); | ||||||
|  |   if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms      = atoi(s); | ||||||
|  |   if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend  = atoi(s); | ||||||
|  |  | ||||||
|   int iter=0; |   int iter=0; | ||||||
|   do { |   do { | ||||||
|  |     if ( iter == 0 ) { | ||||||
|  |       FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord); | ||||||
|  |     } else { | ||||||
|  |       FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify); | ||||||
|  |     } | ||||||
|     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; |     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; | ||||||
|     result_o = Zero(); |     result_o = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond(); | ||||||
| @@ -118,27 +170,23 @@ int main (int argc, char ** argv) | |||||||
|     flops+= CGsiteflops*FrbGrid->gSites()*iters; |     flops+= CGsiteflops*FrbGrid->gSites()*iters; | ||||||
|     std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl; |     std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl; | ||||||
|     std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl; |     std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl; | ||||||
|  |     std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; | ||||||
|  |  | ||||||
|     csum = crc(result_o); |     assert(FlightRecorder::ErrorCount()==0); | ||||||
|  |  | ||||||
|     if ( csumref == 0 ) { |     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||||
|       csumref = csum; |  | ||||||
|     } else { |  | ||||||
|       if ( csum != csumref ) {  |  | ||||||
| 	std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl; |  | ||||||
| 	assert(0); |  | ||||||
|       } else { |  | ||||||
| 	std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     iter ++; |     iter ++; | ||||||
|   } while (time(NULL) < (start + nsecs/2) ); |   } while (time(NULL) < (start + nsecs/10) ); | ||||||
|      |      | ||||||
|   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; |   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; | ||||||
|   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); |   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); | ||||||
|   csumref=0; |  | ||||||
|   int i=0; |   int i=0; | ||||||
|   do {  |   do {  | ||||||
|  |     if ( i == 0 ) { | ||||||
|  |       FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord); | ||||||
|  |     } else { | ||||||
|  |       FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify); | ||||||
|  |     } | ||||||
|     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; |     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; | ||||||
|     result_o_2 = Zero(); |     result_o_2 = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond(); | ||||||
| @@ -150,19 +198,9 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl; |     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl; | ||||||
|     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl; |     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl; | ||||||
|  |     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; | ||||||
|     csum = crc(result_o); |     assert(FlightRecorder::ErrorCount()==0); | ||||||
|  |     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||||
|     if ( csumref == 0 ) { |  | ||||||
|       csumref = csum; |  | ||||||
|     } else { |  | ||||||
|       if ( csum != csumref ) {  |  | ||||||
| 	std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl; |  | ||||||
| 	assert(0); |  | ||||||
|       } else { |  | ||||||
| 	std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     i++; |     i++; | ||||||
|   } while (time(NULL) < (start + nsecs) ); |   } while (time(NULL) < (start + nsecs) ); | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user