Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 03:54:33 +00:00)

Compare commits: fix/HOST_N ... 6d7219b59d (16 commits)
| SHA1 |
|---|
| 6d7219b59d |
| 79ad567dd5 |
| fab1efb48c |
| 660eb76d93 |
| 461cd045c6 |
| fee65d7a75 |
| 31f9971dbf |
| 62e7bf024a |
| 95f3d69cf9 |
| 89c0519f83 |
| 2704b82084 |
| cf8632bbac |
| d224297972 |
| a4d11a630f |
| d87296f3e8 |
| be94cf1c6f |

| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #define _GRID_FFT_H_ | #define _GRID_FFT_H_ | ||||||
|  |  | ||||||
| #ifdef HAVE_FFTW | #ifdef HAVE_FFTW | ||||||
| #ifdef USE_MKL | #if defined(USE_MKL) || defined(GRID_SYCL) | ||||||
| #include <fftw/fftw3.h> | #include <fftw/fftw3.h> | ||||||
| #else | #else | ||||||
| #include <fftw3.h> | #include <fftw3.h> | ||||||
|   | |||||||
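The FFT header hunk above widens the MKL-style include to SYCL builds, since oneMKL ships its FFTW3 wrapper header as `fftw/fftw3.h`. A minimal stand-alone smoke test of that interface (hedged: assumes the standard FFTW3 API, which the MKL wrappers also expose; not Grid code) could look like:

```cpp
// Same include selection as the patch; compile with -DUSE_MKL or -DGRID_SYCL
// against oneMKL, or with neither against stock FFTW3 (-lfftw3).
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>
#else
#include <fftw3.h>
#endif

int main(void) {
  const int N = 8;
  fftw_complex in[N], out[N];
  for (int i = 0; i < N; i++) { in[i][0] = i; in[i][1] = 0.0; }
  fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
  fftw_execute(p);        // forward DFT of a real ramp
  fftw_destroy_plan(p);
  return 0;
}
```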
| @@ -281,12 +281,17 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | ||||||
|   GridBase *grid = left.Grid(); |   GridBase *grid = left.Grid(); | ||||||
|  |   uint32_t csum=0; | ||||||
|  |   //  Uint32Checksum(left,csum); | ||||||
|   ComplexD nrm = rankInnerProduct(left,right); |   ComplexD nrm = rankInnerProduct(left,right); | ||||||
|   //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl; |   RealD local = real(nrm); | ||||||
|  |   GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI | ||||||
|   grid->GlobalSum(nrm); |   grid->GlobalSum(nrm); | ||||||
|  |   GridMPINormLog(local,real(nrm));  | ||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
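Linearised for readability, the new `innerProduct` body brackets the MPI reduction with the two logging hooks, which is what lets a later verify pass distinguish a node-local summation error from an `MPI_Allreduce` one (comments added; `csum` is currently always zero because the checksum call is commented out):

```cpp
ComplexD nrm   = rankInnerProduct(left, right);  // node-local reduction only
RealD    local = real(nrm);
GridNormLog(local, csum);            // log the rank-local partial sum
grid->GlobalSum(nrm);                // MPI all-reduce across ranks
GridMPINormLog(local, real(nrm));    // log local -> global result
return nrm;
```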
| @@ -411,7 +411,7 @@ public: | |||||||
|       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; |       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; | ||||||
|       SeedFixedIntegers(seeds); |       SeedFixedIntegers(seeds); | ||||||
|     } |     } | ||||||
|   void SeedFixedIntegers(const std::vector<int> &seeds){ |   void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){ | ||||||
|  |  | ||||||
|     // Everyone generates the same seed_seq based on input seeds |     // Everyone generates the same seed_seq based on input seeds | ||||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); |     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||||
| @@ -428,7 +428,6 @@ public: | |||||||
|     // MT implementation does not implement fast discard even though |     // MT implementation does not implement fast discard even though | ||||||
|     // in principle this is possible |     // in principle this is possible | ||||||
|     //////////////////////////////////////////////// |     //////////////////////////////////////////////// | ||||||
| #if 1 |  | ||||||
|     thread_for( lidx, _grid->lSites(), { |     thread_for( lidx, _grid->lSites(), { | ||||||
|  |  | ||||||
| 	int gidx; | 	int gidx; | ||||||
| @@ -449,29 +448,12 @@ public: | |||||||
| 	 | 	 | ||||||
| 	int l_idx=generator_idx(o_idx,i_idx); | 	int l_idx=generator_idx(o_idx,i_idx); | ||||||
| 	_generators[l_idx] = master_engine; | 	_generators[l_idx] = master_engine; | ||||||
| 	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | 	if ( britney ) {  | ||||||
|     }); | 	  Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence | ||||||
| #else | 	} else { 	 | ||||||
|     // Everybody loops over global volume. |  | ||||||
|     thread_for( gidx, _grid->_gsites, { |  | ||||||
|  |  | ||||||
| 	// Where is it? |  | ||||||
| 	int rank; |  | ||||||
| 	int o_idx; |  | ||||||
| 	int i_idx; |  | ||||||
|  |  | ||||||
| 	Coordinate gcoor; |  | ||||||
| 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor); |  | ||||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); |  | ||||||
| 	 |  | ||||||
| 	// If this is one of mine we take it |  | ||||||
| 	if( rank == _grid->ThisRank() ){ |  | ||||||
| 	  int l_idx=generator_idx(o_idx,i_idx); |  | ||||||
| 	  _generators[l_idx] = master_engine; |  | ||||||
| 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | ||||||
| 	} | 	} | ||||||
|     }); |     }); | ||||||
| #endif |  | ||||||
| #else  | #else  | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|     // Machine and thread decomposition dependent seeding is efficient |     // Machine and thread decomposition dependent seeding is efficient | ||||||
|   | |||||||
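The new optional `britney` argument (default 0) keeps the old behaviour: each site's engine is copied from the master engine and skipped ahead by its global site index, so the streams are independent of the MPI decomposition. A non-zero value skips by the local index instead, which is cheaper but makes the streams layout-dependent. A hedged usage sketch, assuming Grid's usual initialisation boilerplate:

```cpp
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());

  std::vector<int> seeds({1, 2, 3, 4});
  GridParallelRNG pRNG(UGrid);
  pRNG.SeedFixedIntegers(seeds);     // default (britney=0): skip by global index,
                                     // reproducible across decompositions
  pRNG.SeedFixedIntegers(seeds, 1);  // britney=1: skip by local index; faster,
                                     // but streams depend on the rank layout

  Grid_finalize();
  return 0;
}
```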
| @@ -1,5 +1,5 @@ | |||||||
| #pragma once | #pragma once | ||||||
| #include <type_traits> |  | ||||||
| #if defined(GRID_CUDA) | #if defined(GRID_CUDA) | ||||||
|  |  | ||||||
| #include <cub/cub.cuh> | #include <cub/cub.cuh> | ||||||
| @@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V | |||||||
|    |    | ||||||
|  |  | ||||||
| } | } | ||||||
|  | #endif  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { |  | ||||||
|  | #if defined(GRID_SYCL) | ||||||
|  | template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
|  | { | ||||||
|  |   size_t subvol_size = e1*e2; | ||||||
|  |  | ||||||
|  |   vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); | ||||||
|  |   vobj vobj_zero; | ||||||
|  |   zeroit(vobj_zero); | ||||||
|  |   for (int r = 0; r<rd; r++) {  | ||||||
|  |     mysum[r] = vobj_zero;  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||||
|  |  | ||||||
|  |   auto rb_p = &reduction_buffer[0]; | ||||||
|  |  | ||||||
|  |   // autoView(Data_v, Data, AcceleratorRead); | ||||||
|  |  | ||||||
|  |   //prepare reduction buffer  | ||||||
|  |   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||||
|  |    | ||||||
|  |       int n = s / e2; | ||||||
|  |       int b = s % e2; | ||||||
|  |       int so=r*ostride; // base offset for start of plane  | ||||||
|  |       int ss= so+n*stride+b; | ||||||
|  |  | ||||||
|  |       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); | ||||||
|  |  | ||||||
|  |   }); | ||||||
|  |  | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|  |           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); | ||||||
|  |           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||||
|  |           Reduction, | ||||||
|  |           [=](cl::sycl::id<1> item, auto &sum) { | ||||||
|  |               auto s = item[0]; | ||||||
|  |               sum += rb_p[r*subvol_size+s]; | ||||||
|  |           }); | ||||||
|  |       }); | ||||||
|  |        | ||||||
|  |       | ||||||
|  |   } | ||||||
|  |   theGridAccelerator->wait(); | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |     lvSum[r] = mysum[r]; | ||||||
|  |   } | ||||||
|  |   free(mysum,*theGridAccelerator); | ||||||
|  | } | ||||||
|  | #endif | ||||||
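The small-object SYCL path above submits one `sycl::reduction` kernel per reduced plane `r`, accumulating into USM shared memory, and waits once after the loop so the submissions can overlap. A stand-alone illustration of the same idiom (hedged: plain SYCL 2020 with its own queue, not Grid's `theGridAccelerator`):

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;
  const size_t N = 1024;
  double *in  = sycl::malloc_shared<double>(N, q);
  double *sum = sycl::malloc_shared<double>(1, q);
  for (size_t i = 0; i < N; i++) in[i] = 1.0;
  *sum = 0.0;

  q.submit([&](sycl::handler &cgh) {
    auto red = sycl::reduction(sum, std::plus<double>());
    cgh.parallel_for(sycl::range<1>{N}, red,
                     [=](sycl::id<1> i, auto &acc) { acc += in[i]; });
  });
  q.wait();                        // as in the patch: a single wait at the end

  std::printf("sum = %f (expect %zu)\n", *sum, N);
  sycl::free(in, q);
  sycl::free(sum, q);
  return 0;
}
```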
|  |  | ||||||
|  | template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||||
|   typedef typename vobj::vector_type vector; |   typedef typename vobj::vector_type vector; | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |   const int words = sizeof(vobj)/sizeof(vector); | ||||||
|   const int osites = rd*e1*e2; |   const int osites = rd*e1*e2; | ||||||
| @@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | |||||||
| 	    buf[ss] = dat[ss*words+w]; | 	    buf[ss] = dat[ss*words+w]; | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); |     #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|        |       sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|  |     #elif defined(GRID_SYCL) | ||||||
|  |       sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|  |     #endif | ||||||
|  |  | ||||||
|     for (int r = 0; r < rd; r++) { |     for (int r = 0; r < rd; r++) { | ||||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; |       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||||
|     } |     } | ||||||
| @@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | |||||||
|    |    | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||||
| { | { | ||||||
|   autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. |   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. | ||||||
|     if constexpr (sizeof(vobj) <= 256) {  |     if constexpr (sizeof(vobj) <= 256) {  | ||||||
|       sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|  |       #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|  |         sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |       #elif defined (GRID_SYCL) | ||||||
|  |         sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |       #endif | ||||||
|  |  | ||||||
|     } |     } | ||||||
|     else { |     else { | ||||||
|       sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |       sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|     } |     } | ||||||
| } | } | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #if defined(GRID_SYCL) |  | ||||||
| template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::scalar_object sobj; |  | ||||||
|   size_t subvol_size = e1*e2; |  | ||||||
|  |  | ||||||
|   vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); |  | ||||||
|   vobj vobj_zero; |  | ||||||
|   zeroit(vobj_zero); |  | ||||||
|      |  | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     |  | ||||||
|  |  | ||||||
|   auto rb_p = &reduction_buffer[0]; |  | ||||||
|  |  | ||||||
|   autoView(Data_v, Data, AcceleratorRead); |  | ||||||
|  |  | ||||||
|   //prepare reduction buffer  |  | ||||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  |  | ||||||
|    |  | ||||||
|       int n = s / e2; |  | ||||||
|       int b = s % e2; |  | ||||||
|       int so=r*ostride; // base offset for start of plane  |  | ||||||
|       int ss= so+n*stride+b; |  | ||||||
|  |  | ||||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); |  | ||||||
|  |  | ||||||
|   }); |  | ||||||
|  |  | ||||||
|   for (int r = 0; r < rd; r++) { |  | ||||||
|       mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable |  | ||||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |  | ||||||
|           auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); |  | ||||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, |  | ||||||
|           Reduction, |  | ||||||
|           [=](cl::sycl::id<1> item, auto &sum) { |  | ||||||
|               auto s = item[0]; |  | ||||||
|               sum += rb_p[r*subvol_size+s]; |  | ||||||
|           }); |  | ||||||
|       }); |  | ||||||
|       theGridAccelerator->wait(); |  | ||||||
|       lvSum[r] = mysum[0]; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   free(mysum,*theGridAccelerator); |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
| { | { | ||||||
|   // sum over reduced dimension planes, breaking out orthog dir |   // sum over reduced dimension planes, breaking out orthog dir | ||||||
| @@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data | |||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | ||||||
| { | { | ||||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) |   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) | ||||||
|    |    | ||||||
|   sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|    |  | ||||||
|   #elif defined(GRID_SYCL) |  | ||||||
|    |  | ||||||
|   sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|    |    | ||||||
|   #else |   #else | ||||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|   | |||||||
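Dispatch between the small and large paths is resolved at compile time on the size of the vector object; the 256-byte threshold in the hunk above is what keeps the per-item type small enough for cub and the SYCL reduction. A stand-alone illustration of that `if constexpr` pattern (not Grid code; the structs are made up):

```cpp
#include <cstdio>

// Compile-time size dispatch mirroring the 256-byte threshold in the patch.
template <class vobj>
void sliceSumDispatch(const vobj &) {
  if constexpr (sizeof(vobj) <= 256)
    std::printf("small path (%zu bytes): direct library reduction\n", sizeof(vobj));
  else
    std::printf("large path (%zu bytes): split word-by-word first\n", sizeof(vobj));
}

struct SmallObj { double d[4];  };   //  32 bytes -> small path
struct LargeObj { double d[64]; };   // 512 bytes -> large path

int main() {
  sliceSumDispatch(SmallObj{});
  sliceSumDispatch(LargeObj{});
  return 0;
}
```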
| @@ -90,11 +90,139 @@ NAMESPACE_BEGIN(Grid); | |||||||
| static Coordinate Grid_default_latt; | static Coordinate Grid_default_latt; | ||||||
| static Coordinate Grid_default_mpi; | static Coordinate Grid_default_mpi; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////// | ||||||
|  | // Grid Norm logging for repro testing | ||||||
|  | /////////////////////////////////////////////////////// | ||||||
|  | int GridNormLoggingMode; | ||||||
|  | int32_t GridNormLoggingCounter; | ||||||
|  | int32_t GridMPINormLoggingCounter; | ||||||
|  | std::vector<double> GridNormLogVector; | ||||||
|  | std::vector<double> GridMPINormLogVector; | ||||||
|  | std::vector<uint32_t> GridCsumLogVector; | ||||||
|  |  | ||||||
|  | void SetGridNormLoggingMode(GridNormLoggingMode_t mode) | ||||||
|  | { | ||||||
|  |   switch ( mode ) { | ||||||
|  |   case GridNormLoggingModePrint: | ||||||
|  |     SetGridNormLoggingModePrint(); | ||||||
|  |     break; | ||||||
|  |   case GridNormLoggingModeRecord: | ||||||
|  |     SetGridNormLoggingModeRecord(); | ||||||
|  |     break; | ||||||
|  |   case GridNormLoggingModeVerify: | ||||||
|  |     SetGridNormLoggingModeVerify(); | ||||||
|  |     break; | ||||||
|  |   case GridNormLoggingModeNone: | ||||||
|  |     GridNormLoggingMode = mode; | ||||||
|  |     GridNormLoggingCounter=0; | ||||||
|  |     GridMPINormLoggingCounter=0; | ||||||
|  |     GridNormLogVector.resize(0); | ||||||
|  |     GridCsumLogVector.resize(0); | ||||||
|  |     GridMPINormLogVector.resize(0); | ||||||
|  |     break; | ||||||
|  |   default: | ||||||
|  |     assert(0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void SetGridNormLoggingModePrint(void) | ||||||
|  | { | ||||||
|  |   std::cout << " GridNormLogging Reproducibility logging set to print output " <<std::endl; | ||||||
|  |   GridNormLoggingCounter = 0; | ||||||
|  |   GridMPINormLoggingCounter=0; | ||||||
|  |   GridNormLogVector.resize(0); | ||||||
|  |   GridCsumLogVector.resize(0); | ||||||
|  |   GridMPINormLogVector.resize(0); | ||||||
|  |   GridNormLoggingMode = GridNormLoggingModePrint; | ||||||
|  | } | ||||||
|  | void SetGridNormLoggingModeRecord(void) | ||||||
|  | { | ||||||
|  |   std::cout << " GridNormLogging Reproducibility logging set to RECORD " <<std::endl; | ||||||
|  |   GridNormLoggingCounter = 0; | ||||||
|  |   GridMPINormLoggingCounter=0; | ||||||
|  |   GridNormLogVector.resize(0); | ||||||
|  |   GridCsumLogVector.resize(0); | ||||||
|  |   GridMPINormLogVector.resize(0); | ||||||
|  |   GridNormLoggingMode = GridNormLoggingModeRecord; | ||||||
|  | } | ||||||
|  | void SetGridNormLoggingModeVerify(void) | ||||||
|  | { | ||||||
|  |   std::cout << " GridNormLogging Reproducibility logging set to VERIFY " << GridNormLogVector.size()<< " log entries "<<std::endl; | ||||||
|  |   GridNormLoggingCounter = 0; | ||||||
|  |   GridMPINormLoggingCounter=0; | ||||||
|  |   GridNormLoggingMode = GridNormLoggingModeVerify; | ||||||
|  | } | ||||||
|  | void GridNormLog(double value,uint32_t csum) | ||||||
|  | { | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModePrint) { | ||||||
|  |     std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value << " csum " <<std::hex<<csum<<std::dec <<std::endl; | ||||||
|  |     GridNormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModeRecord) { | ||||||
|  |     GridNormLogVector.push_back(value); | ||||||
|  |     GridCsumLogVector.push_back(csum); | ||||||
|  |     GridNormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModeVerify) { | ||||||
|  |     assert(GridNormLoggingCounter < GridNormLogVector.size()); | ||||||
|  |     if ( (value != GridNormLogVector[GridNormLoggingCounter]) | ||||||
|  | 	 || (csum!=GridCsumLogVector[GridNormLoggingCounter]) ) { | ||||||
|  |       std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<GridNormLogVector[GridNormLoggingCounter] <<std::endl; | ||||||
|  |       std::cerr << " Oops got csum "<< std::hex<<csum<<" expect "<<GridCsumLogVector[GridNormLoggingCounter] <<std::endl; | ||||||
|  |       fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e %x %x\n", | ||||||
|  | 	      GridHostname(), | ||||||
|  | 	      GlobalSharedMemory::WorldShmRank, | ||||||
|  | 	      GridNormLoggingCounter,GridNormLogVector.size(), | ||||||
|  | 	      value, GridNormLogVector[GridNormLoggingCounter], | ||||||
|  | 	      csum, GridCsumLogVector[GridNormLoggingCounter]); fflush(stderr); | ||||||
|  |       assert(0); // Force takedown of job | ||||||
|  |     } | ||||||
|  |     if ( GridNormLogVector.size()==GridNormLoggingCounter ) { | ||||||
|  |       std::cout << " GridNormLogging : Verified entire sequence of "<<GridNormLoggingCounter<<" norms "<<std::endl; | ||||||
|  |     } | ||||||
|  |     GridNormLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void GridMPINormLog(double local,double result) | ||||||
|  | { | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModePrint) { | ||||||
|  |     std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << " -> " <<result <<std::endl; | ||||||
|  |     GridMPINormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModeRecord) { | ||||||
|  |     std::cerr<<"GridMPINormLog RECORDING : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl; | ||||||
|  |     GridMPINormLogVector.push_back(result); | ||||||
|  |     GridMPINormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModeVerify) { | ||||||
|  |     std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl; | ||||||
|  |     assert(GridMPINormLoggingCounter < GridMPINormLogVector.size()); | ||||||
|  |     if ( result != GridMPINormLogVector[GridMPINormLoggingCounter] ) { | ||||||
|  |       fprintf(stderr,"%s:%d MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e hist %.16e\n", | ||||||
|  | 	      GridHostname(), | ||||||
|  | 	      GlobalSharedMemory::WorldShmRank, | ||||||
|  | 	      GridMPINormLoggingCounter,GridMPINormLogVector.size(), | ||||||
|  | 	      result, local, GridMPINormLogVector[GridMPINormLoggingCounter]); fflush(stderr); | ||||||
|  |       assert(0); // Force takedown of job | ||||||
|  |     } | ||||||
|  |     if ( GridMPINormLogVector.size()==GridMPINormLoggingCounter ) { | ||||||
|  |       std::cout << " GridMPINormLogging : Verified entire sequence of "<<GridMPINormLoggingCounter<<" norms "<<std::endl; | ||||||
|  |     } | ||||||
|  |     GridMPINormLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| int GridThread::_threads =1; | int GridThread::_threads =1; | ||||||
| int GridThread::_hyperthreads=1; | int GridThread::_hyperthreads=1; | ||||||
| int GridThread::_cores=1; | int GridThread::_cores=1; | ||||||
|  |  | ||||||
|  | char hostname[HOST_NAME_MAX+1]; | ||||||
|  |  | ||||||
|  | char *GridHostname(void) | ||||||
|  | { | ||||||
|  |   return hostname; | ||||||
|  | } | ||||||
| const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;}; | const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;}; | ||||||
| const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;}; | const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;}; | ||||||
| const Coordinate GridDefaultSimd(int dims,int nsimd) | const Coordinate GridDefaultSimd(int dims,int nsimd) | ||||||
| @@ -397,7 +525,6 @@ void Grid_init(int *argc,char ***argv) | |||||||
|   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; |   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; | ||||||
|   std::cout << GridLogMessage << "================================================ "<<std::endl; |   std::cout << GridLogMessage << "================================================ "<<std::endl; | ||||||
|  |  | ||||||
|   char hostname[HOST_NAME_MAX+1]; |  | ||||||
|   gethostname(hostname, HOST_NAME_MAX+1); |   gethostname(hostname, HOST_NAME_MAX+1); | ||||||
|   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl; |   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid); | |||||||
| void Grid_init(int *argc,char ***argv); | void Grid_init(int *argc,char ***argv); | ||||||
| void Grid_finalize(void); | void Grid_finalize(void); | ||||||
|  |  | ||||||
|  | char * GridHostname(void); | ||||||
|  |  | ||||||
| // internal, controled with --handle | // internal, controled with --handle | ||||||
| void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); | void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); | ||||||
| void Grid_debug_handler_init(void); | void Grid_debug_handler_init(void); | ||||||
| @@ -68,5 +70,21 @@ void GridParseLayout(char **argv,int argc, | |||||||
| void printHash(void); | void printHash(void); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | enum GridNormLoggingMode_t { | ||||||
|  |   GridNormLoggingModeNone, | ||||||
|  |   GridNormLoggingModePrint, | ||||||
|  |   GridNormLoggingModeRecord, | ||||||
|  |   GridNormLoggingModeVerify | ||||||
|  | }; | ||||||
|  | //extern int GridNormLoggingMode; | ||||||
|  | //extern int32_t GridNormLoggingCounter; | ||||||
|  | //extern std::vector<double> GridNormLogVector; | ||||||
|  | void SetGridNormLoggingModePrint(void); | ||||||
|  | void SetGridNormLoggingModeRecord(void); | ||||||
|  | void SetGridNormLoggingModeVerify(void); | ||||||
|  | void SetGridNormLoggingMode(GridNormLoggingMode_t mode); | ||||||
|  | void GridNormLog(double value,uint32_t csum); | ||||||
|  | void GridMPINormLog(double lcl, double glbl); | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
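Taken together, the declarations above form a small reproducibility harness: record the norm (and checksum) history of a reference run, verify subsequent runs against it, print for eyeballing, or reset with `None`. A hedged usage sketch; `runSolve()` and `nreps` are hypothetical placeholders for whatever workload is being checked:

```cpp
SetGridNormLoggingMode(GridNormLoggingModeRecord);    // pass 1: store every
runSolve();                                           // logged norm/csum

for (int rep = 1; rep < nreps; rep++) {
  SetGridNormLoggingMode(GridNormLoggingModeVerify);  // later passes: compare;
  runSolve();                                         // a mismatch asserts with
}                                                     // the "Oops" message above
SetGridNormLoggingMode(GridNormLoggingModeNone);      // clear counters and logs
```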
							
								
								
									
systems/Aurora/tests/repro128.pbs (new file, 41 lines):
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ||||||
|  |  | ||||||
|  | #PBS -q EarlyAppAccess | ||||||
|  | #PBS -l select=128 | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | source ../sourceme.sh | ||||||
|  |  | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | export OMP_NUM_THREADS=3 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  |  | ||||||
|  | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | ||||||
|  | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | ||||||
|  | #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | # 12 ppn, 16 nodes, 192 ranks | ||||||
|  | # 12 ppn, 128 nodes, 1536 ranks | ||||||
|  | CMD="mpiexec -np 1536 -ppn 12  -envall \ | ||||||
|  | 	     ./gpu_tile_compact.sh \ | ||||||
|  | 	     ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \ | ||||||
|  | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap " | ||||||
|  | $CMD  | ||||||
| @@ -2,26 +2,39 @@ | |||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess | #PBS -l select=16:system=sunspot,place=scatter | ||||||
| #PBS -l select=16 |  | ||||||
| #PBS -l walltime=01:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=01:00:00 | ||||||
|  | #PBS -N dwf | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread | #export OMP_PROC_BIND=spread | ||||||
| #unset OMP_PLACES | #unset OMP_PLACES | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
| source ../sourceme.sh | #source ../sourceme.sh | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | #export MPICH_COLL_SYNC=1 | ||||||
|  | #export MPICH_ENV_DISPLAY=1 | ||||||
|  | export MPICH_ | ||||||
| export OMP_NUM_THREADS=3 | export OMP_NUM_THREADS=3 | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  | #export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
| @@ -32,9 +45,17 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | |||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
| export MPICH_OFI_NIC_POLICY=GPU | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
| # 12 ppn, 16 nodes, 192 ranks | DIR=repro.$PBS_JOBID | ||||||
|  | mkdir $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
| CMD="mpiexec -np 192 -ppn 12  -envall \ | CMD="mpiexec -np 192 -ppn 12  -envall \ | ||||||
| 	     ./gpu_tile_compact.sh \ | 	     ../gpu_tile_compact.sh \ | ||||||
| 	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ | 	     ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ | ||||||
| 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative" | ||||||
|  | #--comms-overlap | ||||||
| $CMD  | $CMD  | ||||||
|  |  | ||||||
|  | grep Oops Grid.stderr.* > failures.$PBS_JOBID | ||||||
|  | rm core.* | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
systems/Aurora/tests/repro1gpu.pbs (new file, 81 lines):
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=16:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N repro1gpu | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  |  | ||||||
|  | for g in {0..11} | ||||||
|  | do | ||||||
|  | export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) | ||||||
|  | export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) | ||||||
|  | export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) | ||||||
|  |  | ||||||
|  | export numa=${NUMA_MAP[$g]} | ||||||
|  | export gpu_id=${GPU_MAP[$g]} | ||||||
|  | export tile_id=${TILE_MAP[$g]} | ||||||
|  | export gpu=$gpu_id.$tile_id | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  | echo $gpu > gpu | ||||||
|  |  | ||||||
|  | export ZE_AFFINITY_MASK=$gpu | ||||||
|  | export ONEAPI_DEVICE_FILTER=gpu,level_zero | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \ | ||||||
|  | 	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ | ||||||
|  | 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" | ||||||
|  | echo $CMD | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  | done | ||||||
|  |  | ||||||
|  | wait | ||||||
|  |  | ||||||
							
								
								
									
systems/Aurora/tests/reproN.pbs (new file, 78 lines):
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=16:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N reproN | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  | DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | 	     ../../gpu_tile_compact.sh \ | ||||||
|  | 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  |  | ||||||
|  | wait | ||||||
|  |  | ||||||
|  | for n in ` eval echo {1..$NN} ` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | grep Oops Grid.stderr.* > failures.$PBS_JOBID | ||||||
|  | rm core.* | ||||||
|  |  | ||||||
|  | done | ||||||
| @@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU | |||||||
| CMD="mpiexec -np 192 -ppn 12  -envall \ | CMD="mpiexec -np 192 -ppn 12  -envall \ | ||||||
| 	     ./gpu_tile_compact.sh \ | 	     ./gpu_tile_compact.sh \ | ||||||
| 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ | 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ | ||||||
| 	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" | 	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap" | ||||||
| $CMD  | $CMD  | ||||||
|   | |||||||
| @@ -34,6 +34,45 @@ using namespace Grid; | |||||||
| #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX | #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  | template<class Matrix,class Field> | ||||||
|  |   class SchurDiagMooeeOperatorParanoid :  public SchurOperatorBase<Field> { | ||||||
|  |  public: | ||||||
|  |     Matrix &_Mat; | ||||||
|  |     SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){}; | ||||||
|  |     virtual  void Mpc      (const Field &in, Field &out) { | ||||||
|  |       Field tmp(in.Grid()); | ||||||
|  |       tmp.Checkerboard() = !in.Checkerboard(); | ||||||
|  |       //      std::cout <<" Mpc starting"<<std::endl; | ||||||
|  |  | ||||||
|  |       RealD nn = norm2(in); // std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.Meooe(in,tmp); | ||||||
|  |       nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooeinv "<<nn<<std::endl; | ||||||
|  |       _Mat.MooeeInv(tmp,out); | ||||||
|  |       nn = norm2(out); //std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.Meooe(out,tmp); | ||||||
|  |       nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooee "<<nn<<std::endl; | ||||||
|  |       _Mat.Mooee(in,out); | ||||||
|  |       nn = norm2(out); //std::cout <<" Mpc Prior to axpy "<<nn<<std::endl; | ||||||
|  |       axpy(out,-1.0,tmp,out); | ||||||
|  |     } | ||||||
|  |     virtual void MpcDag   (const Field &in, Field &out){ | ||||||
|  |       Field tmp(in.Grid()); | ||||||
|  |       //      std::cout <<" MpcDag starting"<<std::endl; | ||||||
|  |       RealD nn = norm2(in);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.MeooeDag(in,tmp); | ||||||
|  |       _Mat.MooeeInvDag(tmp,out); | ||||||
|  |       nn = norm2(out);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.MeooeDag(out,tmp); | ||||||
|  |       nn = norm2(tmp);// std::cout <<" MpcDag Prior to Mooee "<<nn<<std::endl; | ||||||
|  |       _Mat.MooeeDag(in,out); | ||||||
|  |       nn = norm2(out);// std::cout <<" MpcDag Prior to axpy "<<nn<<std::endl; | ||||||
|  |       axpy(out,-1.0,tmp,out); | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
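The "paranoid" operator above is functionally the usual Schur even-odd preconditioned matrix, but it takes a `norm2` (a logged global reduction) after every intermediate step, so a verify-mode failure can be pinned to an individual `Meooe` or `MooeeInv` application rather than to the whole `Mpc`. For reference, the operator it reproduces is the standard

$$ M_{pc} \;=\; M_{oo} \;-\; M_{oe}\, M_{ee}^{-1}\, M_{eo} . $$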
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
|   char hostname[HOST_NAME_MAX+1]; |   char hostname[HOST_NAME_MAX+1]; | ||||||
| @@ -82,8 +121,8 @@ int main (int argc, char ** argv) | |||||||
|   result_o_2.Checkerboard() = Odd; |   result_o_2.Checkerboard() = Odd; | ||||||
|   result_o_2 = Zero(); |   result_o_2 = Zero(); | ||||||
|  |  | ||||||
|   SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf); |   SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf); | ||||||
|   SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f); |   SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f); | ||||||
|  |  | ||||||
|   int nsecs=600; |   int nsecs=600; | ||||||
|   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ |   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ | ||||||
| @@ -108,6 +147,11 @@ int main (int argc, char ** argv) | |||||||
|   csumref=0; |   csumref=0; | ||||||
|   int iter=0; |   int iter=0; | ||||||
|   do { |   do { | ||||||
|  |     if ( iter == 0 ) { | ||||||
|  |       SetGridNormLoggingMode(GridNormLoggingModeRecord); | ||||||
|  |     } else { | ||||||
|  |       SetGridNormLoggingMode(GridNormLoggingModeVerify); | ||||||
|  |     } | ||||||
|     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; |     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; | ||||||
|     result_o = Zero(); |     result_o = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond(); | ||||||
| @@ -139,6 +183,11 @@ int main (int argc, char ** argv) | |||||||
|   csumref=0; |   csumref=0; | ||||||
|   int i=0; |   int i=0; | ||||||
|   do {  |   do {  | ||||||
|  |     if ( i == 0 ) { | ||||||
|  |       SetGridNormLoggingMode(GridNormLoggingModeRecord); | ||||||
|  |     } else { | ||||||
|  |       SetGridNormLoggingMode(GridNormLoggingModeVerify); | ||||||
|  |     } | ||||||
|     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; |     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; | ||||||
|     result_o_2 = Zero(); |     result_o_2 = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond(); | ||||||
|   | |||||||