Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 12:04:33 +00:00)

Compare commits: feature/gp ... 7f9d06f339 (15 commits)
| SHA1 |
|---|
| 7f9d06f339 |
| ccf147d6c1 |
| 7aa12b446f |
| c293228102 |
| 5c4c9f721a |
| 057f86c1de |
| cd52e3cbc2 |
| 24602e1259 |
| 8a098889fc |
| ff2ea5de18 |
| 461cd045c6 |
| fee65d7a75 |
| 31f9971dbf |
| d87296f3e8 |
| be94cf1c6f |
@@ -1,5 +1,5 @@
 #pragma once
-#include <type_traits>
 #if defined(GRID_CUDA)
 
 #include <cub/cub.cuh>
@@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
   
 
 }
+#endif
 
-template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+#if defined(GRID_SYCL)
+template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+{
+  size_t subvol_size = e1*e2;
+
+  vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
+  vobj vobj_zero;
+  zeroit(vobj_zero);
+  for (int r = 0; r<rd; r++) {
+    mysum[r] = vobj_zero;
+  }
+
+  commVector<vobj> reduction_buffer(rd*subvol_size);
+
+  auto rb_p = &reduction_buffer[0];
+
+  // autoView(Data_v, Data, AcceleratorRead);
+
+  //prepare reduction buffer
+  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
+
+      int n = s / e2;
+      int b = s % e2;
+      int so=r*ostride; // base offset for start of plane
+      int ss= so+n*stride+b;
+
+      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
+
+  });
+
+  for (int r = 0; r < rd; r++) {
+      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
+          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+          Reduction,
+          [=](cl::sycl::id<1> item, auto &sum) {
+              auto s = item[0];
+              sum += rb_p[r*subvol_size+s];
+          });
+      });
+  }
+  theGridAccelerator->wait();
+  for (int r = 0; r < rd; r++) {
+    lvSum[r] = mysum[r];
+  }
+  free(mysum,*theGridAccelerator);
+}
+#endif
+
+template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
   typedef typename vobj::vector_type vector;
   const int words = sizeof(vobj)/sizeof(vector);
   const int osites = rd*e1*e2;
@@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 	    buf[ss] = dat[ss*words+w];
     });
 
-    sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
-
+    #if defined(GRID_CUDA) || defined(GRID_HIP)
+      sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    #elif defined(GRID_SYCL)
+      sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    #endif
+
     for (int r = 0; r < rd; r++) {
       lvSum_ptr[w+words*r]=lvSum_small[r];
     }
@@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 
 }
 
-template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 {
-  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
+  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
     if constexpr (sizeof(vobj) <= 256) {
-      sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #if defined(GRID_CUDA) || defined(GRID_HIP)
+        sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #elif defined (GRID_SYCL)
+        sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #endif
+
     }
    else {
-      sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
    }
 }
-#endif
 
 
-#if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
-{
-  typedef typename vobj::scalar_object sobj;
-  size_t subvol_size = e1*e2;
-
-  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
-  vobj vobj_zero;
-  zeroit(vobj_zero);
-
-  commVector<vobj> reduction_buffer(rd*subvol_size);
-
-  auto rb_p = &reduction_buffer[0];
-
-  autoView(Data_v, Data, AcceleratorRead);
-
-  //prepare reduction buffer
-  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
-
-      int n = s / e2;
-      int b = s % e2;
-      int so=r*ostride; // base offset for start of plane
-      int ss= so+n*stride+b;
-
-      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
-
-  });
-
-  for (int r = 0; r < rd; r++) {
-      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
-          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
-              auto s = item[0];
-              sum += rb_p[r*subvol_size+s];
-          });
-      });
-      theGridAccelerator->wait();
-      lvSum[r] = mysum[0];
-  }
-
-  free(mysum,*theGridAccelerator);
-}
-#endif
 
 template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
   // sum over reduced dimension planes, breaking out orthog dir
@@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
 
 template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
 {
-  #if defined(GRID_CUDA) || defined(GRID_HIP)
+  #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
 
-  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-
-  #elif defined(GRID_SYCL)
-
-  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 
   #else
   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);

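For readers unfamiliar with the pattern, here is a minimal, self-contained sketch (not Grid code) of the SYCL 2020 reduction idiom that the new sliceSumReduction_sycl_small relies on: one sycl::reduction per reduced-dimension slice, with all kernels submitted before a single wait, in contrast to the removed sliceSumReduction_sycl, which reused a single accumulator and waited inside the loop. The queue selection, sizes and data below are illustrative assumptions.

// Sketch: per-slice sums via SYCL 2020 reductions; names/sizes are placeholders.
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;                      // stands in for *theGridAccelerator
  const size_t nslices = 4;           // stands in for rd
  const size_t subvol  = 1024;        // stands in for e1*e2

  double *data = sycl::malloc_shared<double>(nslices * subvol, q);
  double *sums = sycl::malloc_shared<double>(nslices, q);
  for (size_t i = 0; i < nslices * subvol; i++) data[i] = 1.0;
  // Zero the accumulators first: without initialize_to_identity, a SYCL
  // reduction combines into the existing value of the variable.
  for (size_t r = 0; r < nslices; r++) sums[r] = 0.0;

  // One reduction kernel per slice, no wait inside the loop.
  for (size_t r = 0; r < nslices; r++) {
    q.submit([&](sycl::handler &cgh) {
      auto red = sycl::reduction(&sums[r], sycl::plus<double>());
      cgh.parallel_for(sycl::range<1>{subvol}, red,
                       [=](sycl::id<1> i, auto &sum) {
                         sum += data[r * subvol + i];
                       });
    });
  }
  q.wait();                           // single synchronisation point

  for (size_t r = 0; r < nslices; r++)
    std::printf("slice %zu sum = %g\n", r, sums[r]);

  sycl::free(data, q);
  sycl::free(sums, q);
  return 0;
}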
@@ -405,11 +405,4 @@ NAMESPACE_BEGIN(Grid);
 NAMESPACE_END(Grid);
 
 
-#ifdef GRID_SYCL
-template<typename T> struct
-sycl::is_device_copyable<T, typename std::enable_if<
-			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
-			      void>::type>
-  : public std::true_type {};
-#endif
 
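The block removed above was the opt-in that allowed non-trivially-copyable Grid tensors to be captured by SYCL kernels. As an illustration of the mechanism only (not Grid code), the sketch below shows how sycl::is_device_copyable can be specialised for a type whose copy constructor is user-provided but still behaves as a bitwise copy on the device; MyPayload and the kernel are hypothetical stand-ins.

// Sketch: opting a non-trivially-copyable type into SYCL kernel capture.
#include <sycl/sycl.hpp>
#include <type_traits>

struct MyPayload {
  double v[4];
  MyPayload() = default;
  MyPayload(const MyPayload &o) {        // user-provided copy ctor
    for (int i = 0; i < 4; i++) v[i] = o.v[i];
  }                                      // -> not trivially copyable
};

// Assert that copies are nevertheless device-safe, as the removed
// specialisation did for types satisfying Grid::isGridTensor<T>.
template <> struct sycl::is_device_copyable<MyPayload> : std::true_type {};

int main() {
  sycl::queue q;
  MyPayload p;
  for (int i = 0; i < 4; i++) p.v[i] = i;

  double *out = sycl::malloc_shared<double>(1, q);
  // Capturing p by value requires is_device_copyable<MyPayload>.
  q.single_task([=]() { *out = p.v[0] + p.v[3]; }).wait();

  sycl::free(out, q);
  return 0;
}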
@@ -210,8 +210,8 @@ void acceleratorInit(void)
   cl::sycl::gpu_selector selector;
   cl::sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (selectedDevice);
-  //  theCopyAccelerator = new sycl::queue (selectedDevice);
-  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
+  theCopyAccelerator = new sycl::queue (selectedDevice);
+  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
 
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
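The change above gives theCopyAccelerator its own sycl::queue instead of aliasing theGridAccelerator, so copies and compute can be issued on independent queues. A minimal sketch of that intent follows, with illustrative names and sizes; whether the operations truly overlap depends on the backend and device.

// Sketch: separate compute and copy queues on the same device.
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::device dev{sycl::gpu_selector_v};
  sycl::queue compute_q{dev};   // plays the role of theGridAccelerator
  sycl::queue copy_q{dev};      // plays the role of theCopyAccelerator

  const size_t N = 1 << 20;
  std::vector<float> host(N, 1.0f);
  float *a = sycl::malloc_device<float>(N, compute_q);
  float *b = sycl::malloc_device<float>(N, compute_q);

  // Stage data on the copy queue while the compute queue runs a kernel.
  auto e_copy = copy_q.memcpy(b, host.data(), N * sizeof(float));
  auto e_comp = compute_q.parallel_for(
      sycl::range<1>{N}, [=](sycl::id<1> i) { a[i] = 2.0f * float(i[0]); });

  e_copy.wait();
  e_comp.wait();

  sycl::free(a, compute_q);
  sycl::free(b, compute_q);
  return 0;
}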
@@ -247,9 +247,12 @@ void FlightRecorder::ReductionLog(double local,double global)
 }
 void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 {
+  if(LoggingMode == LoggingModeNone) return;
+
   if ( ChecksumCommsSend ){
   uint64_t *ubuf = (uint64_t *)buf;
   if(LoggingMode == LoggingModeNone) return;
+
 #ifdef GRID_SYCL
   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
   if(LoggingMode == LoggingModePrint) {
@@ -285,12 +288,6 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
     XmitLoggingCounter++;
   }
 #endif
-  } else {
-    uint64_t word = 1;
-    deviceVector<uint64_t> dev(1);
-    acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
-    acceleratorCopySynchronise();
-    MPI_Barrier(MPI_COMM_WORLD);
   }
 }
 void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)

systems/Aurora/benchmarks/bench1.pbs (new file, 67 lines)

@@ -0,0 +1,67 @@
+#!/bin/bash
+
+#PBS -q debug
+#PBS -l select=1
+#PBS -l walltime=00:20:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+module load pti-gpu
+
+#cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=4
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 2 nodes, 24 ranks
+#
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+#$CMD | tee 1node.comms
+
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
+#$CMD | tee 1tile.dwf
+
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 1node.32.32.32.48.dwf
+
+
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+#$CMD | tee 1node.64.64.32.96.dwf
+
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+#$CMD | tee 1node.64.32.32.48.dwf

@@ -1,10 +1,8 @@
 #!/bin/bash
 
-## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
-
-#PBS -q EarlyAppAccess
+#PBS -q workq
 #PBS -l select=2
-#PBS -l walltime=01:00:00
+#PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA
 
 #export OMP_PROC_BIND=spread
@@ -13,11 +11,13 @@
 cd $PBS_O_WORKDIR
 
 source ../sourceme.sh
+module load pti-gpu
 
-export OMP_NUM_THREADS=3
+#cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=4
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
-
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -31,30 +31,25 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 
+# 12 ppn, 2 nodes, 24 ranks
+#
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
-	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+$CMD | tee 2node.comms
 
-#$CMD 
 
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.32.32.64.48.dwf
 
-#$CMD 
 
-CMD="mpiexec -np 1 -ppn 1  -envall \
+CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.64.64.64.96.dwf
 
-$CMD 
-
-CMD="mpiexec -np 1 -ppn 1  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
-
-$CMD 

@@ -1,33 +1,34 @@
 #!/bin/bash
 
-export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
-#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export  NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
-export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
-export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
+#export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
+export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
+export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
 
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
-export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
-export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
-export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
  
-#export GRID_MPICH_NIC_BIND=$NIC
-#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
-export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ZE_AFFINITY_MASK=$gpu_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 
-#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
-#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 
-#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 
-numactl -m $NUMA -N $NUMAP  "$@"
+if [ $PALS_RANKID = "0" ]
+then
+#    numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline  "$@"
+#    numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
+    numactl -m $NUMA -N $NUMA  "$@"
+else 
+    numactl -m $NUMA -N $NUMA  "$@"
+fi

@@ -7,7 +7,7 @@
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
-	--enable-accelerator-aware-mpi=no\
+	--enable-accelerator-aware-mpi=yes\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \

@@ -1,7 +1,9 @@
 #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
 
-module use /soft/modulefiles
-module load intel_compute_runtime/release/agama-devel-682.22
+module load oneapi/release/2023.12.15.001
+
+#module use /soft/modulefiles
+#module load intel_compute_runtime/release/agama-devel-682.22
 
 export FI_CXI_DEFAULT_CQ_SIZE=131072
 export FI_CXI_CQ_FILL_PERCENT=20