Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 03:54:33 +00:00)

Compare commits: 27a5508ea1...7a7f823f00 (7 commits)

| SHA1 |
|---|
| 7a7f823f00 |
| 73c0b29535 |
| 303b83cdb8 |
| 5ef4da3f29 |
| 1502860004 |
| 585efc6f3f |
| 62055e04dd |
@@ -69,7 +69,7 @@ public:
   /*! Construct stout smearing object from explicitly specified rho matrix */
   Smear_Stout(const std::vector<double>& rho_)
     : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
-    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl
+    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
     assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
     }

@@ -7,6 +7,8 @@ uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void)       {return accelerator_threads;};
 void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
 
+#define ENV_LOCAL_RANK_PALS    "PALS_LOCAL_RANKID"
+#define ENV_RANK_PALS          "PALS_RANKID"
 #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 #define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
@@ -231,8 +233,17 @@ void acceleratorInit(void)
   {
     rank = atoi(localRankStr);
   }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_PALS   )) != NULL) { world_rank = atoi(localRankStr);}
+
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
+
   auto devices = cl::sycl::device::get_devices();
   for(int d = 0;d<devices.size();d++){
@@ -244,9 +255,10 @@ void acceleratorInit(void)
     printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
 
 #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
-    GPU_PROP_STR(vendor);
-    GPU_PROP_STR(version);
+    if ( world_rank == 0) {
+
+      GPU_PROP_STR(vendor);
+      GPU_PROP_STR(version);
     //    GPU_PROP_STR(device_type);
     /*
     GPU_PROP(max_compute_units);
@@ -262,7 +274,8 @@ void acceleratorInit(void)
     GPU_PROP(single_fp_config);
     */
     //    GPU_PROP(double_fp_config);
-    GPU_PROP(global_mem_size);
+      GPU_PROP(global_mem_size);
+    }
 
   }
   if ( world_rank == 0 ) {
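This hunk teaches acceleratorInit() to recognise the rank variables exported by the Aurora PALS launcher (PALS_RANKID, PALS_LOCAL_RANKID) alongside the existing OpenMPI, MVAPICH and SLURM checks, and to report the hostname of rank 0. A quick way to confirm which of these variables a given launcher actually sets is to run a small probe in place of the benchmark binary. The sketch below is illustrative only; the script name is hypothetical, and the variable list is copied from the #defines in the patch:

```bash
#!/bin/bash
# probe_ranks.sh (hypothetical helper): print the rank variables Accelerator.cc consults.
# Variable names are the ones #defined in the patch; other launchers leave them unset.
for v in PALS_RANKID PALS_LOCAL_RANKID \
         OMPI_COMM_WORLD_RANK OMPI_COMM_WORLD_LOCAL_RANK \
         SLURM_LOCALID; do
  echo "$(hostname): $v=${!v:-<unset>}"
done
```

Launched under the job launcher, e.g. `mpiexec -np 2 -ppn 1 ./probe_ranks.sh` with flags as in the job scripts below, it prints one line per rank per variable.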
@@ -90,11 +90,11 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
 
   for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  //  Dirichlet[0] = 0;
+  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
 
   Benchmark(Ls,Dirichlet);
 
@@ -105,11 +105,11 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
 
   for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
+  //  Dirichlet[0] = 0;
+  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
+  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
+  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
+  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
 
   Benchmark(Ls,Dirichlet);
 
							
								
								
									
systems/Aurora/benchmarks/bench1024.pbs (new file, +56 lines)
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=1024
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 1024 nodes, 12288 ranks
+#
+CMD="mpiexec -np 12288 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.384 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
+
+CMD="mpiexec -np 12288 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 1024node.dwf.small
+
+CMD="mpiexec -np 12288 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 1024node.dwf
+
+
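One invariant these job scripts rely on: the rank count given to mpiexec must equal select × ppn, the product of the --mpi decomposition must equal that rank count, and every --grid extent must be divisible by the matching --mpi entry (here 1024 × 12 = 12288 = 8·8·8·24, and 128/8 = 384/24 = 16 local sites per direction for the first DWF run). A small sanity-check sketch, with the values of the 1024-node job hard-coded for illustration (not part of the scripts):

```bash
#!/bin/bash
# Illustrative consistency check for the PBS benchmark scripts above.
NODES=1024; PPN=12
MPI="8.8.8.24"              # --mpi layout of the 128.128.128.384 DWF run
GRID="128.128.128.384"
NP=$((NODES * PPN))
read -a m <<< "${MPI//./ }"
read -a g <<< "${GRID//./ }"
prod=1
for d in 0 1 2 3; do
  prod=$(( prod * m[d] ))
  (( g[d] % m[d] == 0 )) || echo "grid[$d]=${g[d]} not divisible by mpi[$d]=${m[d]}"
done
echo "nodes*ppn=$NP  mpi product=$prod"
(( NP == prod )) || echo "WARNING: -np and --mpi layout disagree"
```

The same check applies to the 2048-, 512- and 256-node variants below; only the node count and layouts change.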
							
								
								
									
systems/Aurora/benchmarks/bench2048.pbs (new file, +56 lines)
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=2048
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 2048 nodes, 24576 ranks
+#
+CMD="mpiexec -np 24576 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.384 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
+
+CMD="mpiexec -np 24576 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2048node.dwf.small
+
+CMD="mpiexec -np 24576 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2048node.dwf
+
+
							
								
								
									
systems/Aurora/benchmarks/bench256.pbs (new file, +48 lines)
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=256
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 256 nodes, 3072 ranks
+#
+CMD="mpiexec -np 3072 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
+
+CMD="mpiexec -np 3072 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 256node.dwf.large
							
								
								
									
systems/Aurora/benchmarks/bench512.pbs (new file, +48 lines)
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=512
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 512 nodes, 6144 ranks
+#
+CMD="mpiexec -np 6144 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
+
+CMD="mpiexec -np 6144 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 512node.dwf.large
							
								
								
									
systems/Aurora/benchmarks/bench_scaling.pbs (new file, +80 lines)
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=32
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 32 nodes, 384 ranks
+#
+CMD="mpiexec -np 384 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
+
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 1node.dwf
+
+
+CMD="mpiexec -np 24 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.dwf
+
+CMD="mpiexec -np 48 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 4node.dwf
+
+CMD="mpiexec -np 96 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 8node.dwf
+
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 16node.dwf
+
+
+CMD="mpiexec -np 384 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 32node.dwf
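The DWF runs in bench_scaling.pbs form a weak-scaling series: each --grid is the fixed per-rank local volume 16.32.32.32 multiplied dimension by dimension by the --mpi layout (for example 2.4.4.6 gives 32.128.128.192 on 16 nodes). A short sketch of that rule, reproducing the command-line pairs listed above (illustrative only; the script spells them out explicitly):

```bash
#!/bin/bash
# Derive --grid from --mpi and the fixed 16.32.32.32 per-rank volume used above.
LOCAL=(16 32 32 32)
for MPI in 1.2.2.3 2.2.2.3 2.2.2.6 2.2.4.6 2.4.4.6 4.4.4.6; do
  read -a m <<< "${MPI//./ }"
  GRID="$(( m[0]*LOCAL[0] )).$(( m[1]*LOCAL[1] )).$(( m[2]*LOCAL[2] )).$(( m[3]*LOCAL[3] ))"
  NP=$(( m[0]*m[1]*m[2]*m[3] ))
  echo "--mpi $MPI --grid $GRID   # $NP ranks"
done
```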
@@ -28,6 +28,6 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 
 numactl -m $NUMA -N $NUMAP  "$@"
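This last change only silences the per-rank diagnostic echo before the numactl launch. For orientation, a wrapper of this kind typically derives a GPU tile and a NUMA domain from the PALS local rank. The sketch below is hypothetical: the real gpu_tile_compact.sh body is not part of this diff, and the 6-GPU, 2-tiles-per-GPU, 12-ranks-per-node layout is an assumption; it is shown only to indicate where ZE_AFFINITY_MASK, NUMA and NUMAP would come from.

```bash
#!/bin/bash
# Hypothetical per-rank binding, NOT the actual gpu_tile_compact.sh.
LRANK=${PALS_LOCAL_RANKID:-0}
GPU=$((LRANK / 2))                     # assumes 6 GPUs x 2 tiles, 12 ranks per node
TILE=$((LRANK % 2))
export ZE_AFFINITY_MASK=${GPU}.${TILE} # pin this rank to one Level Zero tile
NUMA=$((LRANK / 6))                    # assumed two NUMA domains per node
NUMAP=$NUMA
#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
exec numactl -m $NUMA -N $NUMAP "$@"
```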