mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-30 19:44:32 +00:00 
			
		
		
		
	Compare commits
	
		
			9 Commits
		
	
	
		
			feature/gp
			...
			ccf147d6c1
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | ccf147d6c1 | ||
|  | 7aa12b446f | ||
|  | c293228102 | ||
|  | 5c4c9f721a | ||
|  | 057f86c1de | ||
|  | cd52e3cbc2 | ||
|  | 24602e1259 | ||
|  | 8a098889fc | ||
|  | ff2ea5de18 | 
| @@ -405,11 +405,4 @@ NAMESPACE_BEGIN(Grid); | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
| #ifdef GRID_SYCL | ||||
| template<typename T> struct | ||||
| sycl::is_device_copyable<T, typename std::enable_if< | ||||
| 			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value), | ||||
| 			      void>::type> | ||||
|   : public std::true_type {}; | ||||
| #endif | ||||
|  | ||||
|   | ||||
| @@ -210,8 +210,8 @@ void acceleratorInit(void) | ||||
|   cl::sycl::gpu_selector selector; | ||||
|   cl::sycl::device selectedDevice { selector }; | ||||
|   theGridAccelerator = new sycl::queue (selectedDevice); | ||||
|   //  theCopyAccelerator = new sycl::queue (selectedDevice); | ||||
|   theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. | ||||
|   theCopyAccelerator = new sycl::queue (selectedDevice); | ||||
|   //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. | ||||
|  | ||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||
|   zeInit(0); | ||||
|   | ||||
| @@ -247,9 +247,12 @@ void FlightRecorder::ReductionLog(double local,double global) | ||||
| } | ||||
| void FlightRecorder::xmitLog(void *buf,uint64_t bytes) | ||||
| { | ||||
|   if(LoggingMode == LoggingModeNone) return; | ||||
|  | ||||
|   if ( ChecksumCommsSend ){ | ||||
|   uint64_t *ubuf = (uint64_t *)buf; | ||||
|   if(LoggingMode == LoggingModeNone) return; | ||||
|    | ||||
| #ifdef GRID_SYCL | ||||
|   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); | ||||
|   if(LoggingMode == LoggingModePrint) { | ||||
| @@ -285,12 +288,6 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) | ||||
|     XmitLoggingCounter++; | ||||
|   } | ||||
| #endif | ||||
|   } else { | ||||
|     uint64_t word = 1; | ||||
|     deviceVector<uint64_t> dev(1); | ||||
|     acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t)); | ||||
|     acceleratorCopySynchronise(); | ||||
|     MPI_Barrier(MPI_COMM_WORLD); | ||||
|   } | ||||
| } | ||||
| void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) | ||||
|   | ||||
							
								
								
									
										67
									
								
								systems/Aurora/benchmarks/bench1.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								systems/Aurora/benchmarks/bench1.pbs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,67 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| #PBS -q debug | ||||
| #PBS -l select=1 | ||||
| #PBS -l walltime=00:20:00 | ||||
| #PBS -A LatticeQCD_aesp_CNDA | ||||
|  | ||||
| #export OMP_PROC_BIND=spread | ||||
| #unset OMP_PLACES | ||||
|  | ||||
| cd $PBS_O_WORKDIR | ||||
|  | ||||
| source ../sourceme.sh | ||||
| module load pti-gpu | ||||
|  | ||||
| #cat $PBS_NODEFILE | ||||
|  | ||||
| export OMP_NUM_THREADS=4 | ||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||
|  | ||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | ||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | ||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | ||||
|  | ||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||
| #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||
| export MPICH_OFI_NIC_POLICY=GPU | ||||
|  | ||||
| # 12 ppn, 2 nodes, 24 ranks | ||||
| # | ||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \ | ||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"  | ||||
| #$CMD | tee 1node.comms | ||||
|  | ||||
|  | ||||
| CMD="mpiexec -np 1 -ppn 1  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ | ||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " | ||||
| #$CMD | tee 1tile.dwf | ||||
|  | ||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \ | ||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" | ||||
| $CMD | tee 1node.32.32.32.48.dwf | ||||
|  | ||||
|  | ||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \ | ||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" | ||||
| #$CMD | tee 1node.64.64.32.96.dwf | ||||
|  | ||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \ | ||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" | ||||
| #$CMD | tee 1node.64.32.32.48.dwf | ||||
|  | ||||
| @@ -1,10 +1,8 @@ | ||||
| #!/bin/bash | ||||
| 
 | ||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ||||
| 
 | ||||
| #PBS -q EarlyAppAccess | ||||
| #PBS -q workq | ||||
| #PBS -l select=2 | ||||
| #PBS -l walltime=01:00:00 | ||||
| #PBS -l walltime=00:20:00 | ||||
| #PBS -A LatticeQCD_aesp_CNDA | ||||
| 
 | ||||
| #export OMP_PROC_BIND=spread | ||||
| @@ -13,11 +11,13 @@ | ||||
| cd $PBS_O_WORKDIR | ||||
| 
 | ||||
| source ../sourceme.sh | ||||
| module load pti-gpu | ||||
| 
 | ||||
| export OMP_NUM_THREADS=3 | ||||
| #cat $PBS_NODEFILE | ||||
| 
 | ||||
| export OMP_NUM_THREADS=4 | ||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||
| 
 | ||||
| 
 | ||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | ||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | ||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | ||||
| @@ -31,30 +31,25 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||
| export MPICH_OFI_NIC_POLICY=GPU | ||||
| 
 | ||||
| # 12 ppn, 2 nodes, 24 ranks | ||||
| # | ||||
| CMD="mpiexec -np 24 -ppn 12  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \ | ||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" | ||||
| 	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ | ||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"  | ||||
| $CMD | tee 2node.comms | ||||
| 
 | ||||
| #$CMD  | ||||
| 
 | ||||
| CMD="mpiexec -np 24 -ppn 12  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \ | ||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" | ||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ | ||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" | ||||
| $CMD | tee 2node.32.32.64.48.dwf | ||||
| 
 | ||||
| #$CMD  | ||||
| 
 | ||||
| CMD="mpiexec -np 1 -ppn 1  -envall \ | ||||
| CMD="mpiexec -np 24 -ppn 12  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ | ||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" | ||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ | ||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" | ||||
| $CMD | tee 2node.64.64.64.96.dwf | ||||
| 
 | ||||
| $CMD  | ||||
| 
 | ||||
| CMD="mpiexec -np 1 -ppn 1  -envall \ | ||||
| 	     ./gpu_tile_compact.sh \ | ||||
| 	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ | ||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" | ||||
| 
 | ||||
| $CMD  | ||||
| @@ -1,33 +1,34 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) | ||||
| #export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) | ||||
| export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) | ||||
| export  NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 ) | ||||
| export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) | ||||
| export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) | ||||
| #export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) | ||||
| #export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1); | ||||
| #export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) | ||||
|  | ||||
| export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ); | ||||
| export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) | ||||
|  | ||||
| export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} | ||||
| export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]} | ||||
| export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]} | ||||
| export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} | ||||
| export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} | ||||
|    | ||||
| #export GRID_MPICH_NIC_BIND=$NIC | ||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id | ||||
|  | ||||
| unset EnableWalkerPartition | ||||
| export EnableImplicitScaling=0 | ||||
| export ZE_AFFINITY_MASK=$gpu_id.$tile_id | ||||
| export ZE_AFFINITY_MASK=$gpu_id | ||||
| export ONEAPI_DEVICE_FILTER=gpu,level_zero | ||||
|  | ||||
| #export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 | ||||
| #export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 | ||||
| #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||
| export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 | ||||
| export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 | ||||
| #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 | ||||
| #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 | ||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 | ||||
| #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 | ||||
|  | ||||
| #echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " | ||||
| echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " | ||||
|  | ||||
| numactl -m $NUMA -N $NUMAP  "$@" | ||||
| if [ $PALS_RANKID = "0" ] | ||||
| then | ||||
| #    numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline  "$@" | ||||
| #    numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" | ||||
|     numactl -m $NUMA -N $NUMA  "$@" | ||||
| else  | ||||
|     numactl -m $NUMA -N $NUMA  "$@" | ||||
| fi | ||||
|   | ||||
| @@ -7,7 +7,7 @@ | ||||
| 	--disable-fermion-reps \ | ||||
| 	--enable-shm=nvlink \ | ||||
| 	--enable-accelerator=sycl \ | ||||
| 	--enable-accelerator-aware-mpi=no\ | ||||
| 	--enable-accelerator-aware-mpi=yes\ | ||||
| 	--enable-unified=no \ | ||||
| 	MPICXX=mpicxx \ | ||||
| 	CXX=icpx \ | ||||
|   | ||||
| @@ -1,7 +1,9 @@ | ||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | ||||
|  | ||||
| module use /soft/modulefiles | ||||
| module load intel_compute_runtime/release/agama-devel-682.22 | ||||
| module load oneapi/release/2023.12.15.001 | ||||
|  | ||||
| #module use /soft/modulefiles | ||||
| #module load intel_compute_runtime/release/agama-devel-682.22 | ||||
|  | ||||
| export FI_CXI_DEFAULT_CQ_SIZE=131072 | ||||
| export FI_CXI_CQ_FILL_PERCENT=20 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user