Initial commit
This commit is contained in:
		
							
								
								
									
										5
									
								
								2-racks/size-C0/16-nodes/.geom
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								2-racks/size-C0/16-nodes/.geom
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,5 @@ | ||||
| nnodes    : 16 | ||||
| ntasks    : 64 | ||||
| partition : gpu | ||||
| mpi-geom  : 2.2.2.8 | ||||
| grid-geom : 48.48.48.96 | ||||
							
								
								
									
										13
									
								
								2-racks/size-C0/16-nodes/cpu-mpi-wrapper.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								2-racks/size-C0/16-nodes/cpu-mpi-wrapper.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,13 @@ | ||||
| #!/usr/bin/env bash | ||||
| # | ||||
| # Per-rank CPU binding wrapper, meant to be placed in front of the real | ||||
| # application on an Open MPI command line: | ||||
| #   mpirun ... ./cpu-mpi-wrapper.sh <app> [args...] | ||||
| # | ||||
| # Local rank N is pinned to CPUs 16N..16N+15 with taskset, its memory is bound | ||||
| # to NUMA node N with numactl, and OMP_PLACES is exported before the wrapped | ||||
| # command is started. | ||||
|  | ||||
| # Fail early when the local rank is missing: the arithmetic below would | ||||
| # otherwise silently evaluate to rank 0 and 'numactl -m' would consume the | ||||
| # first word of the wrapped command as its node mask. | ||||
| lrank=${OMPI_COMM_WORLD_LOCAL_RANK:?must be set by the Open MPI launcher} | ||||
| numa=${lrank} | ||||
| cpus="$(( lrank*16 ))-$(( (lrank+1)*16-1 ))" | ||||
| # NOTE(review): this is '<first cpu>:<last cpu + 1>', while the OMP_PLACES | ||||
| # interval notation reads '<place>:<length>' - confirm the OpenMP runtime | ||||
| # interprets the value as intended. Left unchanged here. | ||||
| places="$(( lrank*16 )):$(( (lrank+1)*16 ))" | ||||
|  | ||||
| # The launcher prefix is an array so every word stays a separate argument. | ||||
| BINDING=(taskset -c "${cpus}" numactl -m "${numa}") | ||||
| export OMP_PLACES=${places} | ||||
|  | ||||
| echo "$(hostname) - ${lrank} binding='${BINDING[*]}'" | ||||
|  | ||||
| # exec replaces this shell, so signals sent to the rank reach the application. | ||||
| exec "${BINDING[@]}" "$@" | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/dwf_fp32.tok
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/dwf_fp32.tok
									
									
									
									
									
										Symbolic link
									
								
							| @@ -0,0 +1 @@ | ||||
| ../dwf_fp32.tok | ||||
							
								
								
									
										14
									
								
								2-racks/size-C0/16-nodes/gpu-mpi-wrapper.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										14
									
								
								2-racks/size-C0/16-nodes/gpu-mpi-wrapper.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,14 @@ | ||||
| #!/usr/bin/env bash | ||||
| # | ||||
| # Per-rank GPU/NIC/NUMA binding wrapper, meant to be placed in front of the | ||||
| # real application on an Open MPI command line: | ||||
| #   mpirun ... ./gpu-mpi-wrapper.sh <app> [args...] | ||||
| # | ||||
| # Local rank N is given GPU N (CUDA_VISIBLE_DEVICES), network device | ||||
| # mlx5_N:1 (UCX_NET_DEVICES) and memory interleaved over NUMA nodes 2N and | ||||
| # 2N+1 (numactl --interleave). | ||||
|  | ||||
| # Fail early when the local rank is missing: an empty value would hide every | ||||
| # GPU (CUDA_VISIBLE_DEVICES='') and select the non-existent device 'mlx5_:1'. | ||||
| lrank=${OMPI_COMM_WORLD_LOCAL_RANK:?must be set by the Open MPI launcher} | ||||
| numa1=$(( 2 * lrank )) | ||||
| numa2=$(( 2 * lrank + 1 )) | ||||
| netdev=mlx5_${lrank}:1 | ||||
|  | ||||
| export CUDA_VISIBLE_DEVICES=${lrank} | ||||
| export UCX_NET_DEVICES=${netdev} | ||||
| BINDING="--interleave=${numa1},${numa2}" | ||||
|  | ||||
| echo "$(hostname) - ${lrank} device=${CUDA_VISIBLE_DEVICES} binding=${BINDING}" | ||||
|  | ||||
| # exec replaces this shell, so signals sent to the rank reach the application. | ||||
| exec numactl "${BINDING}" "$@" | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:25:12 BST 2022 | ||||
| epoch 1661023512 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffef5f3f000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015459e0bd000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015459dcf5000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015459d803000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015459d4d9000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015459d1f8000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015459cf97000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015459e044000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015459cbb7000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x000015459b45b000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015459b08b000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015459adea000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015459acbf000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x000015459a93d000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015459a706000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015459a4ee000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x000015459a2ce000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x0000154599f09000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x0000154599d05000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x000015459df0d000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x0000154599afd000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015459df78000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015459df73000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001545999f1000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001545997e7000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x00001545995e3000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ea00000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.499143 s : Grid Layout | ||||
| Grid : Message : 1.499148 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.499155 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.499157 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.515541 s : Making s innermost grids | ||||
| Grid : Message : 1.532470 s : Initialising 4d RNG | ||||
| Grid : Message : 1.550455 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.550491 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 1.937366 s : Initialising 5d RNG | ||||
| Grid : Message : 2.163040 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.163078 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.467109 s : Initialised RNGs | ||||
| Grid : Message : 8.261272 s : Drawing gauge field | ||||
| Grid : Message : 8.380110 s : Random gauge initialised  | ||||
| Grid : Message : 8.388989 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.599668 s : ***************************************************************** | ||||
| Grid : Message : 13.599694 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.599696 s : ***************************************************************** | ||||
| Grid : Message : 13.599700 s : ***************************************************************** | ||||
| Grid : Message : 13.599702 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.599705 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.599708 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.599710 s : * SINGLE precision  | ||||
| Grid : Message : 13.599712 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.599716 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.599719 s : ***************************************************************** | ||||
| Grid : Message : 14.992290 s : Called warmup | ||||
| Grid : Message : 104.236264 s : Called Dw 30000 times in 9.01365e+07 us | ||||
| Grid : Message : 104.236329 s : mflop/s =   7.46293e+07 | ||||
| Grid : Message : 104.236331 s : mflop/s per rank =  1.16608e+06 | ||||
| Grid : Message : 104.236333 s : mflop/s per node =  4.66433e+06 | ||||
| Grid : Message : 104.236335 s : RF  GiB/s (base 2) =   151645 | ||||
| Grid : Message : 104.236337 s : mem GiB/s (base 2) =   94778.1 | ||||
| Grid : Message : 104.236908 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 104.247209 s : #### Dhop calls report  | ||||
| Grid : Message : 104.247215 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 104.247219 s : WilsonFermion5D TotalTime   /Calls        : 1503.52 us | ||||
| Grid : Message : 104.247221 s : WilsonFermion5D CommTime    /Calls        : 1054.2 us | ||||
| Grid : Message : 104.247223 s : WilsonFermion5D FaceTime    /Calls        : 225.375 us | ||||
| Grid : Message : 104.247225 s : WilsonFermion5D ComputeTime1/Calls        : 3.01152 us | ||||
| Grid : Message : 104.247227 s : WilsonFermion5D ComputeTime2/Calls        : 236.377 us | ||||
| Grid : Message : 104.247294 s : Average mflops/s per call                : 3.59587e+10 | ||||
| Grid : Message : 104.247300 s : Average mflops/s per call per rank       : 5.61855e+08 | ||||
| Grid : Message : 104.247303 s : Average mflops/s per call per node       : 2.24742e+09 | ||||
| Grid : Message : 104.247305 s : Average mflops/s per call (full)         : 7.59233e+07 | ||||
| Grid : Message : 104.247307 s : Average mflops/s per call per rank (full): 1.1863e+06 | ||||
| Grid : Message : 104.247309 s : Average mflops/s per call per node (full): 4.7452e+06 | ||||
| Grid : Message : 104.247311 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 104.247312 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 104.247313 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 104.247314 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 104.247315 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 104.247316 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 112.998074 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 112.998099 s : Called DwDag | ||||
| Grid : Message : 112.998100 s : norm dag result 12.0422 | ||||
| Grid : Message : 113.585000 s : norm dag ref    12.0422 | ||||
| Grid : Message : 113.380300 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 113.140290 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 113.790730 s : src_e0.5 | ||||
| Grid : Message : 113.153215 s : src_o0.5 | ||||
| Grid : Message : 113.170341 s : ********************************************************* | ||||
| Grid : Message : 113.170346 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 113.170347 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 113.170353 s : * SINGLE precision  | ||||
| Grid : Message : 113.170356 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 113.170357 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 113.170361 s : ********************************************************* | ||||
| Grid : Message : 161.702832 s : Deo mflop/s =   6.93159e+07 | ||||
| Grid : Message : 161.702861 s : Deo mflop/s per rank   1.08306e+06 | ||||
| Grid : Message : 161.702863 s : Deo mflop/s per node   4.33224e+06 | ||||
| Grid : Message : 161.702866 s : #### Dhop calls report  | ||||
| Grid : Message : 161.702868 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 161.702870 s : WilsonFermion5D TotalTime   /Calls        : 1617.57 us | ||||
| Grid : Message : 161.702872 s : WilsonFermion5D CommTime    /Calls        : 1105.14 us | ||||
| Grid : Message : 161.702874 s : WilsonFermion5D FaceTime    /Calls        : 294.218 us | ||||
| Grid : Message : 161.702876 s : WilsonFermion5D ComputeTime1/Calls        : 4.85114 us | ||||
| Grid : Message : 161.702878 s : WilsonFermion5D ComputeTime2/Calls        : 241.569 us | ||||
| Grid : Message : 161.702900 s : Average mflops/s per call                : 2.0686e+10 | ||||
| Grid : Message : 161.702904 s : Average mflops/s per call per rank       : 3.23219e+08 | ||||
| Grid : Message : 161.702906 s : Average mflops/s per call per node       : 1.29288e+09 | ||||
| Grid : Message : 161.702908 s : Average mflops/s per call (full)         : 7.05701e+07 | ||||
| Grid : Message : 161.702912 s : Average mflops/s per call per rank (full): 1.10266e+06 | ||||
| Grid : Message : 161.702914 s : Average mflops/s per call per node (full): 4.41063e+06 | ||||
| Grid : Message : 161.702920 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 161.702922 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 161.702923 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 161.702926 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 161.702927 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 161.702928 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 161.722751 s : r_e6.02106 | ||||
| Grid : Message : 161.724439 s : r_o6.0211 | ||||
| Grid : Message : 161.725861 s : res12.0422 | ||||
| Grid : Message : 161.827558 s : norm diff   0 | ||||
| Grid : Message : 161.972191 s : norm diff even  0 | ||||
| Grid : Message : 162.433730 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1005.64059/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| #!/usr/bin/env bash | ||||
| # shellcheck disable=SC1091,SC2050,SC2170 | ||||
|  | ||||
| # using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa | ||||
|  | ||||
| #SBATCH -J power-16A-1005 | ||||
| #SBATCH -A dp207 | ||||
| #SBATCH -t 48:00:00 | ||||
| #SBATCH --nodes=16 | ||||
| #SBATCH --ntasks=64 | ||||
| #SBATCH --ntasks-per-node=4 | ||||
| #SBATCH --cpus-per-task=8 | ||||
| #SBATCH --partition=gpu | ||||
| #SBATCH --gres=gpu:4 | ||||
| #SBATCH --output=%x.%j.out | ||||
| #SBATCH --error=%x.%j.err | ||||
| #SBATCH --reservation=dc-port1_61 | ||||
| #SBATCH --qos=reservation | ||||
| #SBATCH --no-requeue | ||||
|  | ||||
| set -e | ||||
|  | ||||
| # OpenMP/OpenMPI/UCX environment ############################################### | ||||
| # Values are assigned first and exported in one statement per group. | ||||
| OMP_NUM_THREADS=4 | ||||
| OMPI_MCA_btl='^uct,openib' | ||||
| OMPI_MCA_pml=ucx | ||||
| UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc | ||||
| UCX_RNDV_SCHEME=put_zcopy | ||||
| UCX_RNDV_THRESH=16384 | ||||
| UCX_IB_GPU_DIRECT_RDMA=yes | ||||
| UCX_MEMTYPE_CACHE=n | ||||
| export OMP_NUM_THREADS OMPI_MCA_btl OMPI_MCA_pml | ||||
| export UCX_TLS UCX_RNDV_SCHEME UCX_RNDV_THRESH | ||||
| export UCX_IB_GPU_DIRECT_RDMA UCX_MEMTYPE_CACHE | ||||
|  | ||||
| # IO environment ############################################################### | ||||
| # The literal on the left of the comparison is a constant in this script | ||||
| # (it matches the --nodes value requested above). | ||||
| if [ 16 -ne 1 ]; then | ||||
| 	OMPI_MCA_io=romio321 | ||||
| else | ||||
| 	OMPI_MCA_io=ompio | ||||
| fi | ||||
| OMPI_MCA_btl_openib_allow_ib=true | ||||
| OMPI_MCA_btl_openib_device_type=infiniband | ||||
| OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 | ||||
| export OMPI_MCA_io OMPI_MCA_btl_openib_allow_ib | ||||
| export OMPI_MCA_btl_openib_device_type OMPI_MCA_btl_openib_if_exclude | ||||
|  | ||||
| # load environment ############################################################# | ||||
| # The base environment is sourced first; only the 'gpu' partition is handled | ||||
| # by this script, any other partition aborts the job with an error. | ||||
| env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)" | ||||
| source "${env_dir}/env-base.sh" | ||||
| if [ "${SLURM_JOB_PARTITION}" != 'gpu' ]; then | ||||
| 	echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" >&2 | ||||
| 	exit 1 | ||||
| fi | ||||
| source "${env_dir}/env-gpu.sh" | ||||
| spack load sshpass | ||||
|  | ||||
| # application and parameters ################################################### | ||||
| # app: benchmark binary, opt: extra command-line flags, par: optional | ||||
| # parameter file (empty means none). | ||||
| app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32' | ||||
| opt=('--comms-overlap' '--comms-concurrent') | ||||
| par='' | ||||
|  | ||||
| # collect job information ###################################################### | ||||
| # Everything describing this run is stored under job/<job name>.<job id>/. | ||||
| job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID} | ||||
| mkdir -p "${job_info_dir}" | ||||
|  | ||||
| date                         > "${job_info_dir}/start-date" | ||||
| echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date" | ||||
| # 'set' dumps every shell variable and function, not only exported ones | ||||
| set                          > "${job_info_dir}/env" | ||||
| # the application path is quoted so it is always passed as a single argument | ||||
| ldd "${app}"                 > "${job_info_dir}/ldd" | ||||
| md5sum "${app}"              > "${job_info_dir}/app-hash" | ||||
| readelf -a "${app}"          > "${job_info_dir}/elf" | ||||
| echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes" | ||||
| cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script" | ||||
| if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi | ||||
|  | ||||
| # GPU frequency control ######################################################## | ||||
| power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/' | ||||
| freq=1005 | ||||
|  | ||||
| # set frequency | ||||
| for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do  | ||||
| 	${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}" | ||||
| done | ||||
| # start NVIDIA SMI monitoring | ||||
| tmp=$(mktemp) | ||||
| sleep 1 | ||||
| coproc nvidia-smi dmon -o DT &> "${tmp}" | ||||
|  | ||||
| # run! ######################################################################### | ||||
| mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \ | ||||
| 	./gpu-mpi-wrapper.sh \ | ||||
|   ${app} "${par}" "${opt[@]}" \ | ||||
| 	--mpi 2.2.2.8 \ | ||||
|   --accelerator-threads 8 \ | ||||
| 	--grid 48.48.48.96 \ | ||||
| 	--shm 2048 &> "${job_info_dir}/log" | ||||
|  | ||||
| # if we reach that point the application exited successfully ################### | ||||
| touch "${job_info_dir}/success" | ||||
| date > "${job_info_dir}/end-date" | ||||
| echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date" | ||||
|  | ||||
| # reset GPUS ################################################################### | ||||
| # stop monitoring | ||||
| kill -INT "${COPROC_PID}" | ||||
|  | ||||
| # make monitoring DB | ||||
| ${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}" | ||||
|  | ||||
| # reset clocks | ||||
| for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do  | ||||
| 	${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'  | ||||
| done | ||||
| ################################################################################ | ||||
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:22:21 BST 2022 | ||||
| epoch 1661023341 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:37:35 BST 2022 | ||||
| epoch 1661024255 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffff456d000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000154c9a375000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000154c99fad000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000154c99abb000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000154c99791000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000154c994b0000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000154c9924f000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000154c9a2fc000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000154c98e6f000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x0000154c97713000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000154c97343000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000154c970a2000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000154c96f77000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x0000154c96bf5000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000154c969be000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000154c967a6000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x0000154c96586000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x0000154c961c1000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x0000154c95fbd000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x0000154c9a1c5000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x0000154c95db5000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000154c9a230000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000154c9a22b000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000154c95ca9000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000154c95a9f000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x0000154c9589b000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14d8e0000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.312638 s : Grid Layout | ||||
| Grid : Message : 1.312643 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.312650 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.312652 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.327971 s : Making s innermost grids | ||||
| Grid : Message : 1.344471 s : Initialising 4d RNG | ||||
| Grid : Message : 1.361018 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.361045 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 1.837887 s : Initialising 5d RNG | ||||
| Grid : Message : 2.844490 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.845110 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.428202 s : Initialised RNGs | ||||
| Grid : Message : 8.439960 s : Drawing gauge field | ||||
| Grid : Message : 8.560999 s : Random gauge initialised  | ||||
| Grid : Message : 8.573339 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.695651 s : ***************************************************************** | ||||
| Grid : Message : 13.695676 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.695677 s : ***************************************************************** | ||||
| Grid : Message : 13.695678 s : ***************************************************************** | ||||
| Grid : Message : 13.695679 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.695680 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.695681 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.695682 s : * SINGLE precision  | ||||
| Grid : Message : 13.695684 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.695685 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.695686 s : ***************************************************************** | ||||
| Grid : Message : 14.234933 s : Called warmup | ||||
| Grid : Message : 103.428452 s : Called Dw 30000 times in 8.91932e+07 us | ||||
| Grid : Message : 103.428517 s : mflop/s =   7.54186e+07 | ||||
| Grid : Message : 103.428519 s : mflop/s per rank =  1.17842e+06 | ||||
| Grid : Message : 103.428521 s : mflop/s per node =  4.71366e+06 | ||||
| Grid : Message : 103.428523 s : RF  GiB/s (base 2) =   153249 | ||||
| Grid : Message : 103.428525 s : mem GiB/s (base 2) =   95780.5 | ||||
| Grid : Message : 103.429097 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 103.439111 s : #### Dhop calls report  | ||||
| Grid : Message : 103.439118 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 103.439122 s : WilsonFermion5D TotalTime   /Calls        : 1487.69 us | ||||
| Grid : Message : 103.439124 s : WilsonFermion5D CommTime    /Calls        : 1041.46 us | ||||
| Grid : Message : 103.439126 s : WilsonFermion5D FaceTime    /Calls        : 222.459 us | ||||
| Grid : Message : 103.439128 s : WilsonFermion5D ComputeTime1/Calls        : 2.85969 us | ||||
| Grid : Message : 103.439130 s : WilsonFermion5D ComputeTime2/Calls        : 236.325 us | ||||
| Grid : Message : 103.439201 s : Average mflops/s per call                : 3.60313e+10 | ||||
| Grid : Message : 103.439207 s : Average mflops/s per call per rank       : 5.62989e+08 | ||||
| Grid : Message : 103.439209 s : Average mflops/s per call per node       : 2.25196e+09 | ||||
| Grid : Message : 103.439211 s : Average mflops/s per call (full)         : 7.67311e+07 | ||||
| Grid : Message : 103.439213 s : Average mflops/s per call per rank (full): 1.19892e+06 | ||||
| Grid : Message : 103.439215 s : Average mflops/s per call per node (full): 4.7957e+06 | ||||
| Grid : Message : 103.439217 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 103.439218 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 103.439219 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 103.439220 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 103.439221 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 103.439222 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 112.177904 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 112.177939 s : Called DwDag | ||||
| Grid : Message : 112.177940 s : norm dag result 12.0422 | ||||
| Grid : Message : 112.186235 s : norm dag ref    12.0422 | ||||
| Grid : Message : 112.189309 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 112.200523 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 112.263704 s : src_e0.5 | ||||
| Grid : Message : 112.335429 s : src_o0.5 | ||||
| Grid : Message : 112.352238 s : ********************************************************* | ||||
| Grid : Message : 112.352244 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 112.352246 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 112.352248 s : * SINGLE precision  | ||||
| Grid : Message : 112.352250 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 112.352253 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 112.352254 s : ********************************************************* | ||||
| Grid : Message : 160.328889 s : Deo mflop/s =   7.01193e+07 | ||||
| Grid : Message : 160.328922 s : Deo mflop/s per rank   1.09561e+06 | ||||
| Grid : Message : 160.328924 s : Deo mflop/s per node   4.38246e+06 | ||||
| Grid : Message : 160.328927 s : #### Dhop calls report  | ||||
| Grid : Message : 160.328929 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 160.328931 s : WilsonFermion5D TotalTime   /Calls        : 1599.04 us | ||||
| Grid : Message : 160.328933 s : WilsonFermion5D CommTime    /Calls        : 1088.05 us | ||||
| Grid : Message : 160.328935 s : WilsonFermion5D FaceTime    /Calls        : 294.436 us | ||||
| Grid : Message : 160.328937 s : WilsonFermion5D ComputeTime1/Calls        : 4.78577 us | ||||
| Grid : Message : 160.328939 s : WilsonFermion5D ComputeTime2/Calls        : 241.411 us | ||||
| Grid : Message : 160.328966 s : Average mflops/s per call                : 2.07599e+10 | ||||
| Grid : Message : 160.328971 s : Average mflops/s per call per rank       : 3.24373e+08 | ||||
| Grid : Message : 160.328975 s : Average mflops/s per call per node       : 1.29749e+09 | ||||
| Grid : Message : 160.328980 s : Average mflops/s per call (full)         : 7.13878e+07 | ||||
| Grid : Message : 160.328983 s : Average mflops/s per call per rank (full): 1.11543e+06 | ||||
| Grid : Message : 160.328987 s : Average mflops/s per call per node (full): 4.46174e+06 | ||||
| Grid : Message : 160.328989 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 160.328990 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 160.328992 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 160.328995 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 160.328997 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 160.329000 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 160.348014 s : r_e6.02106 | ||||
| Grid : Message : 160.350033 s : r_o6.0211 | ||||
| Grid : Message : 160.351497 s : res12.0422 | ||||
| Grid : Message : 160.466811 s : norm diff   0 | ||||
| Grid : Message : 160.599190 s : norm diff even  0 | ||||
| Grid : Message : 160.669838 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1020.64063/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# Slurm batch job. Steps, in order:
#   1. export the OpenMP/OpenMPI/UCX settings and load the shared environment;
#   2. snapshot job metadata under job/<job name>.<job id>/;
#   3. run `nvidia-smi -ac 1215,${freq}` on every node of the allocation;
#   4. record `nvidia-smi dmon -o DT` output while the benchmark runs;
#   5. run the benchmark through mpirun and ./gpu-mpi-wrapper.sh;
#   6. convert the dmon output with dmon-to-db.sh;
#   7. run `nvidia-smi -ac 1215,1410` on every node again.
# Step 7 (and stopping the monitor) is done from an EXIT trap so that it also
# happens when `set -e` aborts the script part-way through.
#
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

# Both operands are literals in this copy of the script, so the test is
# constant and the romio321 branch is always taken (SC2050 is disabled above).
# NOTE(review): the 16 is presumably the node count filled in by a template;
# confirm against the generator.
if [ 16 -eq 1 ]; then
	export OMPI_MCA_io=ompio
else
	export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
	source "${env_dir}/env-gpu.sh"
else
	echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
	exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir="job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}"
mkdir -p "${job_info_dir}"

date                         > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date"
set                          > "${job_info_dir}/env"
ldd "${app}"                 > "${job_info_dir}/ldd"
md5sum "${app}"              > "${job_info_dir}/app-hash"
readelf -a "${app}"          > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1020

# State shared with restore_gpus; both stay empty until the monitor is started.
dmon_pid=''
tmp=''

# Expand the node list once. As a plain assignment, a failing scontrol aborts
# the script under `set -e` instead of silently yielding an empty loop.
host_list="$(scontrol show hostnames "${SLURM_JOB_NODELIST}")"
if [ -z "${host_list}" ]; then
	echo "error: no hostnames found for node list '${SLURM_JOB_NODELIST}'" 1>&2
	exit 1
fi
mapfile -t hosts <<< "${host_list}"

#######################################
# EXIT handler: stop the dmon coprocess if it is still marked as running,
# run `nvidia-smi -ac 1215,1410` on every host, and delete the temporary
# dmon output file.
# Globals:   hosts, power_dir, dmon_pid, tmp (read)
# Arguments: none
# Returns:   exits with the status the script was exiting with, or 1 if that
#            status was 0 and a clock reset failed
#######################################
restore_gpus() {
	local status=$?
	local h

	trap - EXIT
	# every host must be attempted even if an earlier one fails
	set +e

	if [ -n "${dmon_pid}" ]; then
		kill -INT "${dmon_pid}" 2> /dev/null
	fi
	for h in "${hosts[@]}"; do
		if ! "${power_dir}/remote-sudo.sh" "$h" 'nvidia-smi -ac 1215,1410'; then
			echo "error: could not reset GPU clocks on ${h}" 1>&2
			if [ "${status}" -eq 0 ]; then status=1; fi
		fi
	done
	if [ -n "${tmp}" ]; then
		rm -f -- "${tmp}"
	fi
	exit "${status}"
}
# installed before the first clock change so a partial failure is undone too
trap restore_gpus EXIT

# set frequency
for h in "${hosts[@]}"; do
	"${power_dir}/remote-sudo.sh" "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# keep our own copy: bash unsets COPROC_PID once the coprocess has been reaped
dmon_pid=${COPROC_PID:-}

# run! #########################################################################
# NOTE(review): with par='' the quoted "${par}" still passes one empty argument
# to the application; left unchanged to keep the recorded command line.
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
	./gpu-mpi-wrapper.sh \
	"${app}" "${par}" "${opt[@]}" \
	--mpi 2.2.2.8 \
	--accelerator-threads 8 \
	--grid 48.48.48.96 \
	--shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUs ###################################################################
# stop monitoring
if ! kill -INT "${dmon_pid}" 2> /dev/null; then
	echo "warning: nvidia-smi dmon was not running when the benchmark finished" 1>&2
fi
# already signalled; restore_gpus must not signal this PID again
dmon_pid=''

# make monitoring DB
"${power_dir}/dmon-to-db.sh" "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# clocks are reset by restore_gpus (EXIT trap) when the script ends here
################################################################################
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:34:46 BST 2022 | ||||
| epoch 1661024086 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:43:25 BST 2022 | ||||
| epoch 1661024605 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffd625a8000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ff21a6a000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ff216a2000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ff211b0000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ff20e86000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ff20ba5000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ff20944000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ff219f1000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ff20564000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ff1ee08000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ff1ea38000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ff1e797000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ff1e66c000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x000014ff1e2ea000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ff1e0b3000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ff1de9b000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ff1dc7b000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x000014ff1d8b6000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x000014ff1d6b2000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x000014ff218ba000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x000014ff1d4aa000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ff21925000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ff21920000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ff1d39e000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ff1d194000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x000014ff1cf90000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146a80000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.623478 s : Grid Layout | ||||
| Grid : Message : 1.623482 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.623486 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.623488 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.637678 s : Making s innermost grids | ||||
| Grid : Message : 1.654638 s : Initialising 4d RNG | ||||
| Grid : Message : 1.670417 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.670443 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 2.165386 s : Initialising 5d RNG | ||||
| Grid : Message : 2.399472 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.399504 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.787095 s : Initialised RNGs | ||||
| Grid : Message : 8.568006 s : Drawing gauge field | ||||
| Grid : Message : 8.661012 s : Random gauge initialised  | ||||
| Grid : Message : 8.665024 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.760660 s : ***************************************************************** | ||||
| Grid : Message : 13.760685 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.760687 s : ***************************************************************** | ||||
| Grid : Message : 13.760690 s : ***************************************************************** | ||||
| Grid : Message : 13.760691 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.760692 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.760694 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.760696 s : * SINGLE precision  | ||||
| Grid : Message : 13.760697 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.760698 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.760700 s : ***************************************************************** | ||||
| Grid : Message : 14.326353 s : Called warmup | ||||
| Grid : Message : 102.469231 s : Called Dw 30000 times in 8.81428e+07 us | ||||
| Grid : Message : 102.469296 s : mflop/s =   7.63173e+07 | ||||
| Grid : Message : 102.469299 s : mflop/s per rank =  1.19246e+06 | ||||
| Grid : Message : 102.469307 s : mflop/s per node =  4.76983e+06 | ||||
| Grid : Message : 102.469310 s : RF  GiB/s (base 2) =   155075 | ||||
| Grid : Message : 102.469313 s : mem GiB/s (base 2) =   96921.9 | ||||
| Grid : Message : 102.469886 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 102.480527 s : #### Dhop calls report  | ||||
| Grid : Message : 102.480534 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 102.480538 s : WilsonFermion5D TotalTime   /Calls        : 1470.47 us | ||||
| Grid : Message : 102.480540 s : WilsonFermion5D CommTime    /Calls        : 1029.89 us | ||||
| Grid : Message : 102.480542 s : WilsonFermion5D FaceTime    /Calls        : 217.938 us | ||||
| Grid : Message : 102.480544 s : WilsonFermion5D ComputeTime1/Calls        : 3.09645 us | ||||
| Grid : Message : 102.480546 s : WilsonFermion5D ComputeTime2/Calls        : 235.402 us | ||||
| Grid : Message : 102.480575 s : Average mflops/s per call                : 3.61099e+10 | ||||
| Grid : Message : 102.480579 s : Average mflops/s per call per rank       : 5.64217e+08 | ||||
| Grid : Message : 102.480581 s : Average mflops/s per call per node       : 2.25687e+09 | ||||
| Grid : Message : 102.480583 s : Average mflops/s per call (full)         : 7.76299e+07 | ||||
| Grid : Message : 102.480587 s : Average mflops/s per call per rank (full): 1.21297e+06 | ||||
| Grid : Message : 102.480590 s : Average mflops/s per call per node (full): 4.85187e+06 | ||||
| Grid : Message : 102.480593 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 102.480596 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 102.480598 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 102.480600 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 102.480603 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 102.480605 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 111.202302 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 111.202331 s : Called DwDag | ||||
| Grid : Message : 111.202332 s : norm dag result 12.0422 | ||||
| Grid : Message : 111.204652 s : norm dag ref    12.0422 | ||||
| Grid : Message : 111.207748 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 111.218376 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 111.273653 s : src_e0.5 | ||||
| Grid : Message : 111.352934 s : src_o0.5 | ||||
| Grid : Message : 111.369965 s : ********************************************************* | ||||
| Grid : Message : 111.369970 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 111.369974 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 111.369976 s : * SINGLE precision  | ||||
| Grid : Message : 111.369977 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 111.369981 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 111.369983 s : ********************************************************* | ||||
| Grid : Message : 158.806725 s : Deo mflop/s =   7.09164e+07 | ||||
| Grid : Message : 158.806755 s : Deo mflop/s per rank   1.10807e+06 | ||||
| Grid : Message : 158.806757 s : Deo mflop/s per node   4.43227e+06 | ||||
| Grid : Message : 158.806760 s : #### Dhop calls report  | ||||
| Grid : Message : 158.806762 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 158.806764 s : WilsonFermion5D TotalTime   /Calls        : 1581.06 us | ||||
| Grid : Message : 158.806766 s : WilsonFermion5D CommTime    /Calls        : 1077.77 us | ||||
| Grid : Message : 158.806768 s : WilsonFermion5D FaceTime    /Calls        : 286.721 us | ||||
| Grid : Message : 158.806770 s : WilsonFermion5D ComputeTime1/Calls        : 4.98297 us | ||||
| Grid : Message : 158.806772 s : WilsonFermion5D ComputeTime2/Calls        : 240.035 us | ||||
| Grid : Message : 158.806792 s : Average mflops/s per call                : 2.0753e+10 | ||||
| Grid : Message : 158.806796 s : Average mflops/s per call per rank       : 3.24266e+08 | ||||
| Grid : Message : 158.806798 s : Average mflops/s per call per node       : 1.29706e+09 | ||||
| Grid : Message : 158.806800 s : Average mflops/s per call (full)         : 7.21996e+07 | ||||
| Grid : Message : 158.806804 s : Average mflops/s per call per rank (full): 1.12812e+06 | ||||
| Grid : Message : 158.806807 s : Average mflops/s per call per node (full): 4.51247e+06 | ||||
| Grid : Message : 158.806809 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 158.806810 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 158.806812 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 158.806814 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 158.806816 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 158.806818 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 158.823821 s : r_e6.02106 | ||||
| Grid : Message : 158.827207 s : r_o6.0211 | ||||
| Grid : Message : 158.828617 s : res12.0422 | ||||
| Grid : Message : 158.938772 s : norm diff   0 | ||||
| Grid : Message : 159.724700 s : norm diff even  0 | ||||
| Grid : Message : 159.148761 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1035.64067/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| #!/usr/bin/env bash | ||||
| # shellcheck disable=SC1091,SC2050,SC2170 | ||||
|  | ||||
| # Slurm batch script: runs Benchmark_dwf_fp32 on 16 nodes (64 MPI ranks) after | ||||
| # issuing "nvidia-smi -ac 1215,${freq}" on every node of the job, while an | ||||
| # "nvidia-smi dmon" coprocess records monitoring data. Job metadata and the | ||||
| # benchmark log are written to job/<job name>.<job id>/. | ||||
| # | ||||
| # An EXIT trap stops the monitor, removes its temporary file and resets the | ||||
| # clocks to 1215,1410 on every exit path, so a failed run cannot leave the | ||||
| # nodes of the job at the reduced clock setting. | ||||
| # | ||||
| # using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa | ||||
|  | ||||
| #SBATCH -J power-16A-1035 | ||||
| #SBATCH -A dp207 | ||||
| #SBATCH -t 48:00:00 | ||||
| #SBATCH --nodes=16 | ||||
| #SBATCH --ntasks=64 | ||||
| #SBATCH --ntasks-per-node=4 | ||||
| #SBATCH --cpus-per-task=8 | ||||
| #SBATCH --partition=gpu | ||||
| #SBATCH --gres=gpu:4 | ||||
| #SBATCH --output=%x.%j.out | ||||
| #SBATCH --error=%x.%j.err | ||||
| #SBATCH --reservation=dc-port1_61 | ||||
| #SBATCH --qos=reservation | ||||
| #SBATCH --no-requeue | ||||
|  | ||||
| # Abort on the first failing command; the "success" marker written after the | ||||
| # benchmark relies on this. | ||||
| set -e | ||||
|  | ||||
| # OpenMP/OpenMPI/UCX environment ############################################### | ||||
| export OMP_NUM_THREADS=4 | ||||
| export OMPI_MCA_btl=^uct,openib | ||||
| export OMPI_MCA_pml=ucx | ||||
| export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc | ||||
| export UCX_RNDV_SCHEME=put_zcopy | ||||
| export UCX_RNDV_THRESH=16384 | ||||
| export UCX_IB_GPU_DIRECT_RDMA=yes | ||||
| export UCX_MEMTYPE_CACHE=n | ||||
|  | ||||
| # IO environment ############################################################### | ||||
|  | ||||
| # The literal 16 matches the --nodes value above (hence the SC2050 exemption): | ||||
| # ompio is selected only for a single node, romio321 otherwise. | ||||
| if [ 16 -eq 1 ]; then | ||||
|   export OMPI_MCA_io=ompio | ||||
| else | ||||
|   export OMPI_MCA_io=romio321 | ||||
| fi | ||||
| export OMPI_MCA_btl_openib_allow_ib=true | ||||
| export OMPI_MCA_btl_openib_device_type=infiniband | ||||
| export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 | ||||
|  | ||||
| # load environment ############################################################# | ||||
| env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)" | ||||
| source "${env_dir}/env-base.sh" | ||||
| if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then | ||||
|   source "${env_dir}/env-gpu.sh" | ||||
| else | ||||
|   echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2 | ||||
|   exit 1 | ||||
| fi | ||||
| spack load sshpass | ||||
|  | ||||
| # application and parameters ################################################### | ||||
| app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32' | ||||
| opt=('--comms-overlap' '--comms-concurrent') | ||||
| par='' | ||||
|  | ||||
| # collect job information ###################################################### | ||||
| job_info_dir="job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}" | ||||
| mkdir -p "${job_info_dir}" | ||||
|  | ||||
| date                         > "${job_info_dir}/start-date" | ||||
| echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date" | ||||
| set                          > "${job_info_dir}/env" | ||||
| ldd "${app}"                 > "${job_info_dir}/ldd" | ||||
| md5sum "${app}"              > "${job_info_dir}/app-hash" | ||||
| readelf -a "${app}"          > "${job_info_dir}/elf" | ||||
| echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes" | ||||
| cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script" | ||||
| if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi | ||||
|  | ||||
| # GPU frequency control ######################################################## | ||||
| power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/' | ||||
| freq=1035 | ||||
|  | ||||
| # Resolve the node list once; it is used both to set and to reset the clocks. | ||||
| # A failing scontrol aborts here, before any clock has been touched. | ||||
| host_list="$(scontrol show hostnames "${SLURM_JOB_NODELIST}")" | ||||
| mapfile -t hosts <<< "${host_list}" | ||||
|  | ||||
| tmp=''          # nvidia-smi dmon output file, set once monitoring starts | ||||
| monitor_pid=''  # PID of the dmon coprocess, empty while it is not running | ||||
|  | ||||
| # Ask the monitoring coprocess to stop. Safe to call more than once, and | ||||
| # tolerant of a monitor that has already exited on its own. | ||||
| stop_monitoring() { | ||||
|   if [ -n "${monitor_pid}" ]; then | ||||
|     kill -INT "${monitor_pid}" 2> /dev/null || true | ||||
|     monitor_pid='' | ||||
|   fi | ||||
| } | ||||
|  | ||||
| # EXIT handler: stop the monitor, drop its temporary file and reset the clocks | ||||
| # on every node. The exit status of the script is preserved, except that a | ||||
| # failed clock reset turns an otherwise successful run into a failure. | ||||
| cleanup() { | ||||
|   local status=$? | ||||
|   local h | ||||
|   stop_monitoring | ||||
|   if [ -n "${tmp}" ]; then rm -f -- "${tmp}"; fi | ||||
|   for h in "${hosts[@]}"; do | ||||
|     # keep going on failure so that one bad node does not block the others | ||||
|     if ! "${power_dir}/remote-sudo.sh" "$h" 'nvidia-smi -ac 1215,1410'; then | ||||
|       echo "error: could not reset GPU clocks on ${h}" 1>&2 | ||||
|       if [ "${status}" -eq 0 ]; then status=1; fi | ||||
|     fi | ||||
|   done | ||||
|   exit "${status}" | ||||
| } | ||||
| # Installed before the first clock change so a partial setup is undone as well. | ||||
| trap cleanup EXIT | ||||
|  | ||||
| # set frequency | ||||
| for h in "${hosts[@]}"; do | ||||
|   "${power_dir}/remote-sudo.sh" "$h" "nvidia-smi -ac 1215,${freq}" | ||||
| done | ||||
| # start NVIDIA SMI monitoring | ||||
| tmp="$(mktemp)" | ||||
| sleep 1 | ||||
| coproc nvidia-smi dmon -o DT &> "${tmp}" | ||||
| # COPROC_PID is only valid while the coprocess is alive, so keep a copy | ||||
| monitor_pid="${COPROC_PID}" | ||||
|  | ||||
| # run! ######################################################################### | ||||
| # ${par:+...} passes the parameter file only when one is configured, instead | ||||
| # of handing the application an empty positional argument. | ||||
| mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \ | ||||
|   ./gpu-mpi-wrapper.sh \ | ||||
|   "${app}" ${par:+"${par}"} "${opt[@]}" \ | ||||
|   --mpi 2.2.2.8 \ | ||||
|   --accelerator-threads 8 \ | ||||
|   --grid 48.48.48.96 \ | ||||
|   --shm 2048 &> "${job_info_dir}/log" | ||||
|  | ||||
| # if we reach this point the application exited successfully ################### | ||||
| touch "${job_info_dir}/success" | ||||
| date > "${job_info_dir}/end-date" | ||||
| echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date" | ||||
|  | ||||
| # reset GPUS ################################################################### | ||||
| # stop monitoring | ||||
| stop_monitoring | ||||
|  | ||||
| # make monitoring DB | ||||
| "${power_dir}/dmon-to-db.sh" "${tmp}" smi-dmon-16A.db "clock_limit_${freq}" | ||||
|  | ||||
| # the clocks are reset and the temporary file removed by the EXIT trap | ||||
| ################################################################################ | ||||
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:40:36 BST 2022 | ||||
| epoch 1661024436 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:49:15 BST 2022 | ||||
| epoch 1661024955 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffe2b5fb000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001470cbce5000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001470cb91d000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001470cb42b000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001470cb101000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001470cae20000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001470cabbf000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001470cbc6c000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001470ca7df000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x00001470c9083000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001470c8cb3000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001470c8a12000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001470c88e7000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x00001470c8565000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001470c832e000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001470c8116000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x00001470c7ef6000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x00001470c7b31000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x00001470c792d000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x00001470cbb35000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x00001470c7725000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001470cbba0000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001470cbb9b000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001470c7619000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001470c740f000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x00001470c720b000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14f600000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.720184 s : Grid Layout | ||||
| Grid : Message : 1.720188 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.720196 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.720199 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.735275 s : Making s innermost grids | ||||
| Grid : Message : 1.752323 s : Initialising 4d RNG | ||||
| Grid : Message : 1.768478 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.768504 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 2.201838 s : Initialising 5d RNG | ||||
| Grid : Message : 2.438683 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.438714 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.906459 s : Initialised RNGs | ||||
| Grid : Message : 8.718015 s : Drawing gauge field | ||||
| Grid : Message : 8.851801 s : Random gauge initialised  | ||||
| Grid : Message : 8.862438 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.896599 s : ***************************************************************** | ||||
| Grid : Message : 13.896621 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.896622 s : ***************************************************************** | ||||
| Grid : Message : 13.896623 s : ***************************************************************** | ||||
| Grid : Message : 13.896624 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.896625 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.896626 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.896627 s : * SINGLE precision  | ||||
| Grid : Message : 13.896628 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.896629 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.896630 s : ***************************************************************** | ||||
| Grid : Message : 14.428387 s : Called warmup | ||||
| Grid : Message : 101.915473 s : Called Dw 30000 times in 8.74869e+07 us | ||||
| Grid : Message : 101.915527 s : mflop/s =   7.68895e+07 | ||||
| Grid : Message : 101.915529 s : mflop/s per rank =  1.2014e+06 | ||||
| Grid : Message : 101.915531 s : mflop/s per node =  4.80559e+06 | ||||
| Grid : Message : 101.915533 s : RF  GiB/s (base 2) =   156238 | ||||
| Grid : Message : 101.915535 s : mem GiB/s (base 2) =   97648.5 | ||||
| Grid : Message : 101.916107 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 101.926218 s : #### Dhop calls report  | ||||
| Grid : Message : 101.926225 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 101.926228 s : WilsonFermion5D TotalTime   /Calls        : 1459.21 us | ||||
| Grid : Message : 101.926230 s : WilsonFermion5D CommTime    /Calls        : 1016.78 us | ||||
| Grid : Message : 101.926232 s : WilsonFermion5D FaceTime    /Calls        : 219.506 us | ||||
| Grid : Message : 101.926234 s : WilsonFermion5D ComputeTime1/Calls        : 2.78512 us | ||||
| Grid : Message : 101.926236 s : WilsonFermion5D ComputeTime2/Calls        : 235.25 us | ||||
| Grid : Message : 101.926330 s : Average mflops/s per call                : 3.60206e+10 | ||||
| Grid : Message : 101.926334 s : Average mflops/s per call per rank       : 5.62822e+08 | ||||
| Grid : Message : 101.926336 s : Average mflops/s per call per node       : 2.25129e+09 | ||||
| Grid : Message : 101.926338 s : Average mflops/s per call (full)         : 7.82287e+07 | ||||
| Grid : Message : 101.926340 s : Average mflops/s per call per rank (full): 1.22232e+06 | ||||
| Grid : Message : 101.926342 s : Average mflops/s per call per node (full): 4.88929e+06 | ||||
| Grid : Message : 101.926344 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 101.926345 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 101.926346 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 101.926347 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 101.926348 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 101.926349 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 110.616405 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 110.616430 s : Called DwDag | ||||
| Grid : Message : 110.616431 s : norm dag result 12.0422 | ||||
| Grid : Message : 110.621134 s : norm dag ref    12.0422 | ||||
| Grid : Message : 110.624323 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 110.637247 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 110.698940 s : src_e0.5 | ||||
| Grid : Message : 110.766761 s : src_o0.5 | ||||
| Grid : Message : 110.783307 s : ********************************************************* | ||||
| Grid : Message : 110.783311 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 110.783313 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 110.783315 s : * SINGLE precision  | ||||
| Grid : Message : 110.783316 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 110.783317 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 110.783318 s : ********************************************************* | ||||
| Grid : Message : 157.764942 s : Deo mflop/s =   7.16075e+07 | ||||
| Grid : Message : 157.764976 s : Deo mflop/s per rank   1.11887e+06 | ||||
| Grid : Message : 157.764978 s : Deo mflop/s per node   4.47547e+06 | ||||
| Grid : Message : 157.764981 s : #### Dhop calls report  | ||||
| Grid : Message : 157.764983 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 157.764985 s : WilsonFermion5D TotalTime   /Calls        : 1565.89 us | ||||
| Grid : Message : 157.764987 s : WilsonFermion5D CommTime    /Calls        : 1058.27 us | ||||
| Grid : Message : 157.764989 s : WilsonFermion5D FaceTime    /Calls        : 292.487 us | ||||
| Grid : Message : 157.764991 s : WilsonFermion5D ComputeTime1/Calls        : 4.72584 us | ||||
| Grid : Message : 157.764993 s : WilsonFermion5D ComputeTime2/Calls        : 239.678 us | ||||
| Grid : Message : 157.765020 s : Average mflops/s per call                : 2.07994e+10 | ||||
| Grid : Message : 157.765024 s : Average mflops/s per call per rank       : 3.2499e+08 | ||||
| Grid : Message : 157.765027 s : Average mflops/s per call per node       : 1.29996e+09 | ||||
| Grid : Message : 157.765031 s : Average mflops/s per call (full)         : 7.28994e+07 | ||||
| Grid : Message : 157.765035 s : Average mflops/s per call per rank (full): 1.13905e+06 | ||||
| Grid : Message : 157.765039 s : Average mflops/s per call per node (full): 4.55621e+06 | ||||
| Grid : Message : 157.765042 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 157.765044 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 157.765046 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 157.765049 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 157.765051 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 157.765053 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 157.783731 s : r_e6.02106 | ||||
| Grid : Message : 157.786036 s : r_o6.0211 | ||||
| Grid : Message : 157.787470 s : res12.0422 | ||||
| Grid : Message : 157.905573 s : norm diff   0 | ||||
| Grid : Message : 158.337590 s : norm diff even  0 | ||||
| Grid : Message : 158.959010 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1050.64071/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| #!/usr/bin/env bash | ||||
| # shellcheck disable=SC1091,SC2050,SC2170 | ||||
|  | ||||
| # Slurm batch script: runs Benchmark_dwf_fp32 through mpirun on 16 nodes | ||||
| # (64 tasks) with the GPU application clocks set to "1215,${freq}", while | ||||
| # 'nvidia-smi dmon' output is recorded on the node executing this script. | ||||
| # Job metadata, the benchmark log and a copy of this script are stored under | ||||
| # job/<job name>.<job id>/. The application clocks are put back to 1215,1410 | ||||
| # and the monitoring is stopped whenever the script exits, successfully or not. | ||||
| # | ||||
| # using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa | ||||
|  | ||||
| #SBATCH -J power-16A-1050 | ||||
| #SBATCH -A dp207 | ||||
| #SBATCH -t 48:00:00 | ||||
| #SBATCH --nodes=16 | ||||
| #SBATCH --ntasks=64 | ||||
| #SBATCH --ntasks-per-node=4 | ||||
| #SBATCH --cpus-per-task=8 | ||||
| #SBATCH --partition=gpu | ||||
| #SBATCH --gres=gpu:4 | ||||
| #SBATCH --output=%x.%j.out | ||||
| #SBATCH --error=%x.%j.err | ||||
| #SBATCH --reservation=dc-port1_61 | ||||
| #SBATCH --qos=reservation | ||||
| #SBATCH --no-requeue | ||||
|  | ||||
| # abort on the first failing command; the EXIT trap installed further down | ||||
| # takes care of restoring the GPU clocks in that case | ||||
| set -e | ||||
|  | ||||
| # OpenMP/OpenMPI/UCX environment ############################################### | ||||
| export OMP_NUM_THREADS=4 | ||||
| export OMPI_MCA_btl=^uct,openib | ||||
| export OMPI_MCA_pml=ucx | ||||
| export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc | ||||
| export UCX_RNDV_SCHEME=put_zcopy | ||||
| export UCX_RNDV_THRESH=16384 | ||||
| export UCX_IB_GPU_DIRECT_RDMA=yes | ||||
| export UCX_MEMTYPE_CACHE=n | ||||
|  | ||||
| # IO environment ############################################################### | ||||
|  | ||||
| # NOTE(review): the constant test looks like a substituted node count from a | ||||
| # template (SC2050 is disabled above) - with 16 nodes the romio321 branch is used | ||||
| if [ 16 -eq 1 ]; then | ||||
| 	export OMPI_MCA_io=ompio | ||||
| else | ||||
| 	export OMPI_MCA_io=romio321 | ||||
| fi | ||||
| export OMPI_MCA_btl_openib_allow_ib=true | ||||
| export OMPI_MCA_btl_openib_device_type=infiniband | ||||
| export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 | ||||
|  | ||||
| # load environment ############################################################# | ||||
| env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)" | ||||
| source "${env_dir}/env-base.sh" | ||||
| if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then | ||||
| 	source "${env_dir}/env-gpu.sh" | ||||
| else | ||||
| 	echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2 | ||||
|   exit 1 | ||||
| fi | ||||
| spack load sshpass | ||||
|  | ||||
| # application and parameters ################################################### | ||||
| app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32' | ||||
| opt=('--comms-overlap' '--comms-concurrent') | ||||
| par='' | ||||
|  | ||||
| # collect job information ###################################################### | ||||
| job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID} | ||||
| mkdir -p "${job_info_dir}" | ||||
|  | ||||
| date                         > "${job_info_dir}/start-date" | ||||
| echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date" | ||||
| set                          > "${job_info_dir}/env" | ||||
| ldd "${app}"                 > "${job_info_dir}/ldd" | ||||
| md5sum "${app}"              > "${job_info_dir}/app-hash" | ||||
| readelf -a "${app}"          > "${job_info_dir}/elf" | ||||
| echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes" | ||||
| cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script" | ||||
| if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi | ||||
|  | ||||
| # GPU frequency control ######################################################## | ||||
| power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/' | ||||
| freq=1050 | ||||
|  | ||||
| # nodes of the job, expanded once and reused for setting and resetting clocks | ||||
| mapfile -t hosts < <(scontrol show hostnames "${SLURM_JOB_NODELIST}") | ||||
|  | ||||
| # PID of the 'nvidia-smi dmon' coprocess, empty while no monitoring is running | ||||
| monitor_pid='' | ||||
|  | ||||
| # Interrupt the monitoring coprocess if it was started. Safe to call more than | ||||
| # once; a coprocess that already exited is not treated as an error. | ||||
| stop_monitoring() { | ||||
| 	if [ -n "${monitor_pid}" ]; then | ||||
| 		kill -INT "${monitor_pid}" || true | ||||
| 		monitor_pid='' | ||||
| 	fi | ||||
| } | ||||
|  | ||||
| # EXIT handler: stop the monitoring and reset the application clocks on every | ||||
| # node, whatever made the script exit. The exit status the script was leaving | ||||
| # with is kept; a failed reset turns a successful status into 1. | ||||
| restore_gpus() { | ||||
| 	local status=$? | ||||
| 	local h | ||||
| 	trap - EXIT | ||||
| 	# one unreachable node must not prevent the reset of the remaining ones | ||||
| 	set +e | ||||
| 	stop_monitoring | ||||
| 	for h in "${hosts[@]}"; do | ||||
| 		if ! "${power_dir}/remote-sudo.sh" "$h" 'nvidia-smi -ac 1215,1410'; then | ||||
| 			echo "error: could not reset GPU clocks on ${h}" 1>&2 | ||||
| 			if [ "${status}" -eq 0 ]; then status=1; fi | ||||
| 		fi | ||||
| 	done | ||||
| 	exit "${status}" | ||||
| } | ||||
| # installed before any clock is changed so that a failure half-way through the | ||||
| # loop below still resets the nodes that were already modified | ||||
| trap restore_gpus EXIT | ||||
|  | ||||
| # set frequency | ||||
| for h in "${hosts[@]}"; do | ||||
| 	"${power_dir}/remote-sudo.sh" "$h" "nvidia-smi -ac 1215,${freq}" | ||||
| done | ||||
| # start NVIDIA SMI monitoring | ||||
| tmp=$(mktemp) | ||||
| sleep 1 | ||||
| coproc nvidia-smi dmon -o DT &> "${tmp}" | ||||
| monitor_pid=${COPROC_PID:-} | ||||
|  | ||||
| # run! ######################################################################### | ||||
| # "${par}" is passed even when it is empty, in which case the application | ||||
| # receives an empty first argument | ||||
| mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \ | ||||
| 	./gpu-mpi-wrapper.sh \ | ||||
|   "${app}" "${par}" "${opt[@]}" \ | ||||
| 	--mpi 2.2.2.8 \ | ||||
|   --accelerator-threads 8 \ | ||||
| 	--grid 48.48.48.96 \ | ||||
| 	--shm 2048 &> "${job_info_dir}/log" | ||||
|  | ||||
| # if we reach that point the application exited successfully ################### | ||||
| touch "${job_info_dir}/success" | ||||
| date > "${job_info_dir}/end-date" | ||||
| echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date" | ||||
|  | ||||
| # reset GPUS ################################################################### | ||||
| # stop monitoring | ||||
| stop_monitoring | ||||
|  | ||||
| # make monitoring DB | ||||
| "${power_dir}/dmon-to-db.sh" "${tmp}" smi-dmon-16A.db "clock_limit_${freq}" | ||||
|  | ||||
| # reset clocks: done by restore_gpus through the EXIT trap when the script ends | ||||
| ################################################################################ | ||||
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:46:27 BST 2022 | ||||
| epoch 1661024788 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:55:03 BST 2022 | ||||
| epoch 1661025303 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffd9b1d1000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a2805dc000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a280214000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a27fd22000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a27f9f8000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a27f717000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a27f4b6000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a280563000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a27f0d6000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a27d97a000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a27d5aa000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a27d309000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a27d1de000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x000014a27ce5c000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a27cc25000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a27ca0d000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a27c7ed000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x000014a27c428000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x000014a27c224000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x000014a28042c000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x000014a27c01c000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a280497000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a280492000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a27bf10000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a27bd06000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x000014a27bb02000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150120000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.428183 s : Grid Layout | ||||
| Grid : Message : 1.428187 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.428193 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.428196 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.443217 s : Making s innermost grids | ||||
| Grid : Message : 1.455165 s : Initialising 4d RNG | ||||
| Grid : Message : 1.471981 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.472007 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 1.853366 s : Initialising 5d RNG | ||||
| Grid : Message : 2.875960 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.876470 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.305707 s : Initialised RNGs | ||||
| Grid : Message : 8.397843 s : Drawing gauge field | ||||
| Grid : Message : 8.484443 s : Random gauge initialised  | ||||
| Grid : Message : 8.488387 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.563627 s : ***************************************************************** | ||||
| Grid : Message : 13.563653 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.563655 s : ***************************************************************** | ||||
| Grid : Message : 13.563658 s : ***************************************************************** | ||||
| Grid : Message : 13.563659 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.563660 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.563663 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.563665 s : * SINGLE precision  | ||||
| Grid : Message : 13.563667 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.563668 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.563669 s : ***************************************************************** | ||||
| Grid : Message : 14.958310 s : Called warmup | ||||
| Grid : Message : 101.445133 s : Called Dw 30000 times in 8.73489e+07 us | ||||
| Grid : Message : 101.445198 s : mflop/s =   7.7011e+07 | ||||
| Grid : Message : 101.445200 s : mflop/s per rank =  1.2033e+06 | ||||
| Grid : Message : 101.445202 s : mflop/s per node =  4.81319e+06 | ||||
| Grid : Message : 101.445204 s : RF  GiB/s (base 2) =   156485 | ||||
| Grid : Message : 101.445206 s : mem GiB/s (base 2) =   97802.9 | ||||
| Grid : Message : 101.445777 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 101.455931 s : #### Dhop calls report  | ||||
| Grid : Message : 101.455939 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 101.455943 s : WilsonFermion5D TotalTime   /Calls        : 1457.12 us | ||||
| Grid : Message : 101.455945 s : WilsonFermion5D CommTime    /Calls        : 1014.92 us | ||||
| Grid : Message : 101.455947 s : WilsonFermion5D FaceTime    /Calls        : 219.441 us | ||||
| Grid : Message : 101.455949 s : WilsonFermion5D ComputeTime1/Calls        : 2.84344 us | ||||
| Grid : Message : 101.455951 s : WilsonFermion5D ComputeTime2/Calls        : 235.367 us | ||||
| Grid : Message : 101.455978 s : Average mflops/s per call                : 3.61947e+10 | ||||
| Grid : Message : 101.455982 s : Average mflops/s per call per rank       : 5.65543e+08 | ||||
| Grid : Message : 101.455984 s : Average mflops/s per call per node       : 2.26217e+09 | ||||
| Grid : Message : 101.455986 s : Average mflops/s per call (full)         : 7.83407e+07 | ||||
| Grid : Message : 101.455990 s : Average mflops/s per call per rank (full): 1.22407e+06 | ||||
| Grid : Message : 101.455992 s : Average mflops/s per call per node (full): 4.8963e+06 | ||||
| Grid : Message : 101.455994 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 101.455995 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 101.455999 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 101.456001 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 101.456002 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 101.456004 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 110.188024 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 110.188051 s : Called DwDag | ||||
| Grid : Message : 110.188052 s : norm dag result 12.0422 | ||||
| Grid : Message : 110.200211 s : norm dag ref    12.0422 | ||||
| Grid : Message : 110.203215 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 110.213199 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 110.281787 s : src_e0.5 | ||||
| Grid : Message : 110.353808 s : src_o0.5 | ||||
| Grid : Message : 110.370985 s : ********************************************************* | ||||
| Grid : Message : 110.370991 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 110.370992 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 110.370995 s : * SINGLE precision  | ||||
| Grid : Message : 110.370997 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 110.370998 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 110.371000 s : ********************************************************* | ||||
| Grid : Message : 157.314519 s : Deo mflop/s =   7.16631e+07 | ||||
| Grid : Message : 157.314545 s : Deo mflop/s per rank   1.11974e+06 | ||||
| Grid : Message : 157.314547 s : Deo mflop/s per node   4.47894e+06 | ||||
| Grid : Message : 157.314550 s : #### Dhop calls report  | ||||
| Grid : Message : 157.314552 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 157.314554 s : WilsonFermion5D TotalTime   /Calls        : 1564.64 us | ||||
| Grid : Message : 157.314556 s : WilsonFermion5D CommTime    /Calls        : 1060.37 us | ||||
| Grid : Message : 157.314558 s : WilsonFermion5D FaceTime    /Calls        : 287.98 us | ||||
| Grid : Message : 157.314560 s : WilsonFermion5D ComputeTime1/Calls        : 4.91794 us | ||||
| Grid : Message : 157.314562 s : WilsonFermion5D ComputeTime2/Calls        : 239.551 us | ||||
| Grid : Message : 157.314587 s : Average mflops/s per call                : 2.07265e+10 | ||||
| Grid : Message : 157.314591 s : Average mflops/s per call per rank       : 3.23852e+08 | ||||
| Grid : Message : 157.314593 s : Average mflops/s per call per node       : 1.29541e+09 | ||||
| Grid : Message : 157.314596 s : Average mflops/s per call (full)         : 7.29577e+07 | ||||
| Grid : Message : 157.314600 s : Average mflops/s per call per rank (full): 1.13996e+06 | ||||
| Grid : Message : 157.314602 s : Average mflops/s per call per node (full): 4.55985e+06 | ||||
| Grid : Message : 157.314605 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 157.314606 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 157.314608 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 157.314610 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 157.314613 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 157.314614 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 157.334523 s : r_e6.02106 | ||||
| Grid : Message : 157.336050 s : r_o6.0211 | ||||
| Grid : Message : 157.337424 s : res12.0422 | ||||
| Grid : Message : 157.450236 s : norm diff   0 | ||||
| Grid : Message : 157.586163 s : norm diff even  0 | ||||
| Grid : Message : 157.657558 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1065.64076/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# Slurm batch script for one point of a GPU power/frequency scan.
#
# What it does, in order:
#   1. exports the OpenMP/OpenMPI/UCX settings used for this machine,
#   2. loads the shared software environment (gpu partition only),
#   3. records job metadata (dates, shell variables, ldd/md5/readelf of the
#      binary, node list, a copy of this script) under job/<name>.<jobid>/,
#   4. sets the GPU application clocks on every node of the allocation to
#      ${mem_clock},${freq} through remote-sudo.sh,
#   5. samples `nvidia-smi dmon` on the node executing this script while the
#      benchmark runs under mpirun,
#   6. converts the samples to a database with dmon-to-db.sh,
#   7. restores the application clocks to ${mem_clock},${restore_freq}.
#
# Steps 5-7 are protected by an EXIT trap: whatever makes the script stop
# after the clocks were lowered, the monitor is signalled, the clocks are
# restored and the temporary sample file is removed.
#
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

# The left-hand literal is the node count substituted when this script was
# generated from its template, hence the constant comparison.
if [ 16 -eq 1 ]; then
  export OMPI_MCA_io=ompio
else
  export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
  source "${env_dir}/env-gpu.sh"
else
  echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
  exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date                         > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date"
set                          > "${job_info_dir}/env"
ldd "${app}"                 > "${job_info_dir}/ldd"
md5sum "${app}"              > "${job_info_dir}/app-hash"
readelf -a "${app}"          > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1065          # graphics clock limit applied for this run
mem_clock=1215     # first field passed to `nvidia-smi -ac` (memory clock)
restore_freq=1410  # graphics clock restored once the run is over

# Expand the compact Slurm node list once; both the set and the reset pass
# iterate over the same hosts.
mapfile -t hosts < <(scontrol show hostnames "${SLURM_JOB_NODELIST}")
if [ "${#hosts[@]}" -eq 0 ]; then
  # Without hosts no clock would be changed, yet the monitoring data would
  # still be labelled clock_limit_${freq}; refuse to produce mislabelled data.
  echo "error: could not expand node list '${SLURM_JOB_NODELIST}'" 1>&2
  exit 1
fi

tmp=''            # nvidia-smi dmon output file, created below
monitor_pid=''    # PID of the nvidia-smi dmon coprocess, empty when not running
clocks_changed=0  # becomes 1 as soon as a clock change has been attempted

#######################################
# Set the GPU application clocks on every node of the allocation.
# Every host is attempted even if an earlier one fails.
# Globals:   hosts, power_dir, mem_clock (read)
# Arguments: $1 - graphics clock to request
# Returns:   0 if all hosts succeeded, 1 otherwise
#######################################
set_gpu_clocks() {
  local graphics_clock=$1
  local host
  local rc=0
  for host in "${hosts[@]}"; do
    "${power_dir}/remote-sudo.sh" "${host}" \
      "nvidia-smi -ac ${mem_clock},${graphics_clock}" || rc=1
  done
  return "${rc}"
}

#######################################
# Ask the monitoring coprocess to stop. Safe to call repeatedly.
# Globals:   monitor_pid (read, cleared)
#######################################
stop_monitoring() {
  if [ -n "${monitor_pid}" ]; then
    # The monitor may already be gone; that must not abort the clean-up.
    kill -INT "${monitor_pid}" 2> /dev/null || true
    monitor_pid=''
  fi
}

#######################################
# EXIT handler: stop monitoring, restore the clocks, drop the temp file.
# Keeps the script's exit status, except that a failed clock reset turns
# an otherwise successful exit into status 1.
# Globals:   clocks_changed, restore_freq, tmp (read)
#######################################
cleanup() {
  local status=$?
  trap - EXIT
  stop_monitoring
  if [ "${clocks_changed}" -eq 1 ]; then
    if ! set_gpu_clocks "${restore_freq}"; then
      echo "warning: failed to reset GPU clocks on at least one node" 1>&2
      if [ "${status}" -eq 0 ]; then status=1; fi
    fi
  fi
  if [ -n "${tmp}" ]; then rm -f -- "${tmp}"; fi
  exit "${status}"
}
trap cleanup EXIT

# set frequency
# Flag first: a partially applied change must also be undone on exit.
clocks_changed=1
set_gpu_clocks "${freq}"

# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
monitor_pid=${COPROC_PID:-}

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
  ./gpu-mpi-wrapper.sh \
  "${app}" "${par}" "${opt[@]}" \
  --mpi 2.2.2.8 \
  --accelerator-threads 8 \
  --grid 48.48.48.96 \
  --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
stop_monitoring

# make monitoring DB
"${power_dir}/dmon-to-db.sh" "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# The clock reset and temp-file removal are performed by the EXIT trap.
################################################################################
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:52:16 BST 2022 | ||||
| epoch 1661025136 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 21:00:52 BST 2022 | ||||
| epoch 1661025652 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffceffcb000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014c73048f000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014c7300c7000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014c72fbd5000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014c72f8ab000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014c72f5ca000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014c72f369000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014c730416000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014c72ef89000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x000014c72d82d000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014c72d45d000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014c72d1bc000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014c72d091000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x000014c72cd0f000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014c72cad8000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014c72c8c0000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x000014c72c6a0000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x000014c72c2db000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x000014c72c0d7000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x000014c7302df000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x000014c72becf000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014c73034a000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014c730345000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014c72bdc3000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014c72bbb9000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x000014c72b9b5000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1548a0000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.498999 s : Grid Layout | ||||
| Grid : Message : 1.499003 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.499009 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.499010 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.516697 s : Making s innermost grids | ||||
| Grid : Message : 1.528026 s : Initialising 4d RNG | ||||
| Grid : Message : 1.543296 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.543322 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 1.803104 s : Initialising 5d RNG | ||||
| Grid : Message : 2.280210 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.280810 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.463560 s : Initialised RNGs | ||||
| Grid : Message : 8.316566 s : Drawing gauge field | ||||
| Grid : Message : 8.441882 s : Random gauge initialised  | ||||
| Grid : Message : 8.454498 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.615874 s : ***************************************************************** | ||||
| Grid : Message : 13.615901 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.615903 s : ***************************************************************** | ||||
| Grid : Message : 13.615904 s : ***************************************************************** | ||||
| Grid : Message : 13.615905 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.615906 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.615910 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.615912 s : * SINGLE precision  | ||||
| Grid : Message : 13.615914 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.615916 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.615918 s : ***************************************************************** | ||||
| Grid : Message : 14.175758 s : Called warmup | ||||
| Grid : Message : 100.948265 s : Called Dw 30000 times in 8.67724e+07 us | ||||
| Grid : Message : 100.948328 s : mflop/s =   7.75226e+07 | ||||
| Grid : Message : 100.948330 s : mflop/s per rank =  1.21129e+06 | ||||
| Grid : Message : 100.948332 s : mflop/s per node =  4.84516e+06 | ||||
| Grid : Message : 100.948334 s : RF  GiB/s (base 2) =   157524 | ||||
| Grid : Message : 100.948336 s : mem GiB/s (base 2) =   98452.5 | ||||
| Grid : Message : 100.948912 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 100.958922 s : #### Dhop calls report  | ||||
| Grid : Message : 100.958930 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 100.958934 s : WilsonFermion5D TotalTime   /Calls        : 1447.35 us | ||||
| Grid : Message : 100.958936 s : WilsonFermion5D CommTime    /Calls        : 1006.18 us | ||||
| Grid : Message : 100.958938 s : WilsonFermion5D FaceTime    /Calls        : 218.625 us | ||||
| Grid : Message : 100.958940 s : WilsonFermion5D ComputeTime1/Calls        : 2.6472 us | ||||
| Grid : Message : 100.958942 s : WilsonFermion5D ComputeTime2/Calls        : 235.108 us | ||||
| Grid : Message : 100.958970 s : Average mflops/s per call                : 3.6261e+10 | ||||
| Grid : Message : 100.958974 s : Average mflops/s per call per rank       : 5.66578e+08 | ||||
| Grid : Message : 100.958976 s : Average mflops/s per call per node       : 2.26631e+09 | ||||
| Grid : Message : 100.958978 s : Average mflops/s per call (full)         : 7.88698e+07 | ||||
| Grid : Message : 100.958981 s : Average mflops/s per call per rank (full): 1.23234e+06 | ||||
| Grid : Message : 100.958983 s : Average mflops/s per call per node (full): 4.92936e+06 | ||||
| Grid : Message : 100.958986 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 100.958987 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 100.958988 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 100.958991 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 100.958992 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 100.958995 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 109.635912 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 109.635940 s : Called DwDag | ||||
| Grid : Message : 109.635941 s : norm dag result 12.0422 | ||||
| Grid : Message : 109.641498 s : norm dag ref    12.0422 | ||||
| Grid : Message : 109.644623 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 109.654599 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 109.718075 s : src_e0.5 | ||||
| Grid : Message : 109.790285 s : src_o0.5 | ||||
| Grid : Message : 109.807211 s : ********************************************************* | ||||
| Grid : Message : 109.807217 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 109.807219 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 109.807221 s : * SINGLE precision  | ||||
| Grid : Message : 109.807224 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 109.807225 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 109.807226 s : ********************************************************* | ||||
| Grid : Message : 156.357075 s : Deo mflop/s =   7.22704e+07 | ||||
| Grid : Message : 156.357109 s : Deo mflop/s per rank   1.12923e+06 | ||||
| Grid : Message : 156.357111 s : Deo mflop/s per node   4.5169e+06 | ||||
| Grid : Message : 156.357114 s : #### Dhop calls report  | ||||
| Grid : Message : 156.357116 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 156.357118 s : WilsonFermion5D TotalTime   /Calls        : 1551.51 us | ||||
| Grid : Message : 156.357120 s : WilsonFermion5D CommTime    /Calls        : 1049.38 us | ||||
| Grid : Message : 156.357122 s : WilsonFermion5D FaceTime    /Calls        : 285.792 us | ||||
| Grid : Message : 156.357124 s : WilsonFermion5D ComputeTime1/Calls        : 4.81357 us | ||||
| Grid : Message : 156.357126 s : WilsonFermion5D ComputeTime2/Calls        : 239.16 us | ||||
| Grid : Message : 156.357146 s : Average mflops/s per call                : 2.07719e+10 | ||||
| Grid : Message : 156.357150 s : Average mflops/s per call per rank       : 3.24561e+08 | ||||
| Grid : Message : 156.357152 s : Average mflops/s per call per node       : 1.29824e+09 | ||||
| Grid : Message : 156.357154 s : Average mflops/s per call (full)         : 7.35747e+07 | ||||
| Grid : Message : 156.357158 s : Average mflops/s per call per rank (full): 1.1496e+06 | ||||
| Grid : Message : 156.357161 s : Average mflops/s per call per node (full): 4.59842e+06 | ||||
| Grid : Message : 156.357163 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 156.357165 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 156.357166 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 156.357168 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 156.357175 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 156.357176 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 156.375718 s : r_e6.02106 | ||||
| Grid : Message : 156.378883 s : r_o6.0211 | ||||
| Grid : Message : 156.380335 s : res12.0422 | ||||
| Grid : Message : 156.489162 s : norm diff   0 | ||||
| Grid : Message : 156.617774 s : norm diff even  0 | ||||
| Grid : Message : 156.694536 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1080.64082/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# Slurm batch script for the power benchmark: sets the GPU application clocks
# to 1215,${freq} on every node of the allocation, runs Benchmark_dwf_fp32
# under mpirun while `nvidia-smi dmon` logs on the batch host, stores the
# monitoring output with dmon-to-db.sh and finally sets the clocks back to
# 1215,1410. The clock reset and monitor shutdown run from an EXIT trap so
# they also happen when the benchmark (or any later step) fails.

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

# the node count is a literal here (presumably substituted by the job
# template), so for this 16-node job only the romio321 branch is taken
if [ 16 -eq 1 ]; then
	export OMPI_MCA_io=ompio
else
	export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
	source "${env_dir}/env-gpu.sh"
else
	echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
	exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date                         > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date"
set                          > "${job_info_dir}/env"
ldd "${app}"                 > "${job_info_dir}/ldd"
md5sum "${app}"              > "${job_info_dir}/app-hash"
readelf -a "${app}"          > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1080

# expand the node list once so the set and reset loops act on the same hosts;
# a plain assignment lets `set -e` stop the job if scontrol itself fails
node_names=$(scontrol show hostnames "${SLURM_JOB_NODELIST}")
mapfile -t nodes <<< "${node_names}"

tmp=''          # nvidia-smi dmon output file, created just before monitoring
monitor_pid=''  # PID of the nvidia-smi dmon coprocess, empty when not running

# Send SIGINT to the nvidia-smi monitor if one was started. A failure to
# signal it is reported on stderr but never aborts the caller, so the clock
# reset that follows is not skipped.
stop_monitor() {
	if [ -z "${monitor_pid}" ]; then return 0; fi
	kill -INT "${monitor_pid}" \
		|| echo "warning: could not signal nvidia-smi monitor (pid ${monitor_pid})" 1>&2
	monitor_pid=''
}

# EXIT handler: stop the monitor, put the clocks back to 1215,1410 on every
# node (best effort, one failing node does not stop the loop), remove the
# temporary file and exit with the script's status. A reset failure turns a
# zero status into 1, matching the previous behaviour of failing the job.
cleanup() {
	local status=$?
	local h
	trap - EXIT
	set +e
	stop_monitor
	for h in "${nodes[@]}"; do
		if ! "${power_dir}/remote-sudo.sh" "${h}" 'nvidia-smi -ac 1215,1410'; then
			echo "error: could not reset GPU clocks on ${h}" 1>&2
			if [ "${status}" -eq 0 ]; then status=1; fi
		fi
	done
	if [ -n "${tmp}" ]; then rm -f -- "${tmp}"; fi
	exit "${status}"
}
# installed before the first clock change so a partial failure is undone too
trap cleanup EXIT

# set frequency
for h in "${nodes[@]}"; do
	"${power_dir}/remote-sudo.sh" "${h}" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# bash unsets COPROC_PID once the coprocess has exited, so keep our own copy
monitor_pid=${COPROC_PID}

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
	./gpu-mpi-wrapper.sh \
	"${app}" "${par}" "${opt[@]}" \
	--mpi 2.2.2.8 \
	--accelerator-threads 8 \
	--grid 48.48.48.96 \
	--shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
stop_monitor

# make monitoring DB
"${power_dir}/dmon-to-db.sh" "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# clocks are reset and the temporary file is removed by the EXIT trap (cleanup)
################################################################################
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 20:58:06 BST 2022 | ||||
| epoch 1661025486 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 21:06:38 BST 2022 | ||||
| epoch 1661025998 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffc219f0000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014aa89605000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014aa8923d000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014aa88d4b000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014aa88a21000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014aa88740000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014aa884df000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014aa8958c000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014aa880ff000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x000014aa869a3000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014aa865d3000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014aa86332000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014aa86207000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x000014aa85e85000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014aa85c4e000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014aa85a36000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x000014aa85816000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x000014aa85451000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x000014aa8524d000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x000014aa89455000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x000014aa85045000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014aa894c0000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014aa894bb000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014aa84f39000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014aa84d2f000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x000014aa84b2b000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146d00000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.412895 s : Grid Layout | ||||
| Grid : Message : 1.412899 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.412905 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.412909 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.428319 s : Making s innermost grids | ||||
| Grid : Message : 1.445373 s : Initialising 4d RNG | ||||
| Grid : Message : 1.461658 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.461680 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 1.902912 s : Initialising 5d RNG | ||||
| Grid : Message : 2.141255 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.141291 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.353326 s : Initialised RNGs | ||||
| Grid : Message : 8.518633 s : Drawing gauge field | ||||
| Grid : Message : 8.626652 s : Random gauge initialised  | ||||
| Grid : Message : 8.630634 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.722925 s : ***************************************************************** | ||||
| Grid : Message : 13.722949 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.722950 s : ***************************************************************** | ||||
| Grid : Message : 13.722951 s : ***************************************************************** | ||||
| Grid : Message : 13.722952 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.722953 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.722954 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.722955 s : * SINGLE precision  | ||||
| Grid : Message : 13.722956 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.722957 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.722958 s : ***************************************************************** | ||||
| Grid : Message : 14.254628 s : Called warmup | ||||
| Grid : Message : 100.327406 s : Called Dw 30000 times in 8.60725e+07 us | ||||
| Grid : Message : 100.327470 s : mflop/s =   7.8153e+07 | ||||
| Grid : Message : 100.327472 s : mflop/s per rank =  1.22114e+06 | ||||
| Grid : Message : 100.327474 s : mflop/s per node =  4.88456e+06 | ||||
| Grid : Message : 100.327476 s : RF  GiB/s (base 2) =   158805 | ||||
| Grid : Message : 100.327478 s : mem GiB/s (base 2) =   99253.2 | ||||
| Grid : Message : 100.328051 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 100.337927 s : #### Dhop calls report  | ||||
| Grid : Message : 100.337935 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 100.337943 s : WilsonFermion5D TotalTime   /Calls        : 1435.69 us | ||||
| Grid : Message : 100.337946 s : WilsonFermion5D CommTime    /Calls        : 996.547 us | ||||
| Grid : Message : 100.337949 s : WilsonFermion5D FaceTime    /Calls        : 217.079 us | ||||
| Grid : Message : 100.337953 s : WilsonFermion5D ComputeTime1/Calls        : 2.78067 us | ||||
| Grid : Message : 100.337955 s : WilsonFermion5D ComputeTime2/Calls        : 234.472 us | ||||
| Grid : Message : 100.337971 s : Average mflops/s per call                : 3.63872e+10 | ||||
| Grid : Message : 100.337974 s : Average mflops/s per call per rank       : 5.68549e+08 | ||||
| Grid : Message : 100.337976 s : Average mflops/s per call per node       : 2.2742e+09 | ||||
| Grid : Message : 100.337980 s : Average mflops/s per call (full)         : 7.95104e+07 | ||||
| Grid : Message : 100.337982 s : Average mflops/s per call per rank (full): 1.24235e+06 | ||||
| Grid : Message : 100.337986 s : Average mflops/s per call per node (full): 4.9694e+06 | ||||
| Grid : Message : 100.337988 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 100.337990 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 100.337992 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 100.337995 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 100.337998 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 100.338000 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 109.354730 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 109.355200 s : Called DwDag | ||||
| Grid : Message : 109.355210 s : norm dag result 12.0422 | ||||
| Grid : Message : 109.404420 s : norm dag ref    12.0422 | ||||
| Grid : Message : 109.435430 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 109.565940 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 109.123204 s : src_e0.5 | ||||
| Grid : Message : 109.194082 s : src_o0.5 | ||||
| Grid : Message : 109.211743 s : ********************************************************* | ||||
| Grid : Message : 109.211749 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 109.211751 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 109.211754 s : * SINGLE precision  | ||||
| Grid : Message : 109.211756 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 109.211759 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 109.211761 s : ********************************************************* | ||||
| Grid : Message : 155.351395 s : Deo mflop/s =   7.29132e+07 | ||||
| Grid : Message : 155.351424 s : Deo mflop/s per rank   1.13927e+06 | ||||
| Grid : Message : 155.351427 s : Deo mflop/s per node   4.55708e+06 | ||||
| Grid : Message : 155.351433 s : #### Dhop calls report  | ||||
| Grid : Message : 155.351436 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 155.351440 s : WilsonFermion5D TotalTime   /Calls        : 1537.8 us | ||||
| Grid : Message : 155.351445 s : WilsonFermion5D CommTime    /Calls        : 1037.77 us | ||||
| Grid : Message : 155.351449 s : WilsonFermion5D FaceTime    /Calls        : 285.044 us | ||||
| Grid : Message : 155.351453 s : WilsonFermion5D ComputeTime1/Calls        : 4.8771 us | ||||
| Grid : Message : 155.351457 s : WilsonFermion5D ComputeTime2/Calls        : 237.861 us | ||||
| Grid : Message : 155.351481 s : Average mflops/s per call                : 2.07287e+10 | ||||
| Grid : Message : 155.351485 s : Average mflops/s per call per rank       : 3.23886e+08 | ||||
| Grid : Message : 155.351488 s : Average mflops/s per call per node       : 1.29554e+09 | ||||
| Grid : Message : 155.351492 s : Average mflops/s per call (full)         : 7.42306e+07 | ||||
| Grid : Message : 155.351496 s : Average mflops/s per call per rank (full): 1.15985e+06 | ||||
| Grid : Message : 155.351500 s : Average mflops/s per call per node (full): 4.63942e+06 | ||||
| Grid : Message : 155.351504 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 155.351506 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 155.351508 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 155.351511 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 155.351513 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 155.351515 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 155.370290 s : r_e6.02106 | ||||
| Grid : Message : 155.372244 s : r_o6.0211 | ||||
| Grid : Message : 155.373660 s : res12.0422 | ||||
| Grid : Message : 155.495172 s : norm diff   0 | ||||
| Grid : Message : 155.622362 s : norm diff even  0 | ||||
| Grid : Message : 155.695812 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1095.64087/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| #!/usr/bin/env bash | ||||
| # shellcheck disable=SC1091,SC2050,SC2170 | ||||
|  | ||||
| # using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa | ||||
|  | ||||
| #SBATCH -J power-16A-1095 | ||||
| #SBATCH -A dp207 | ||||
| #SBATCH -t 48:00:00 | ||||
| #SBATCH --nodes=16 | ||||
| #SBATCH --ntasks=64 | ||||
| #SBATCH --ntasks-per-node=4 | ||||
| #SBATCH --cpus-per-task=8 | ||||
| #SBATCH --partition=gpu | ||||
| #SBATCH --gres=gpu:4 | ||||
| #SBATCH --output=%x.%j.out | ||||
| #SBATCH --error=%x.%j.err | ||||
| #SBATCH --reservation=dc-port1_61 | ||||
| #SBATCH --qos=reservation | ||||
| #SBATCH --no-requeue | ||||
|  | ||||
| set -e | ||||
|  | ||||
| # OpenMP/OpenMPI/UCX environment ############################################### | ||||
| export OMP_NUM_THREADS=4 | ||||
| export OMPI_MCA_btl=^uct,openib | ||||
| export OMPI_MCA_pml=ucx | ||||
| export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc | ||||
| export UCX_RNDV_SCHEME=put_zcopy | ||||
| export UCX_RNDV_THRESH=16384 | ||||
| export UCX_IB_GPU_DIRECT_RDMA=yes | ||||
| export UCX_MEMTYPE_CACHE=n | ||||
|  | ||||
| # IO environment ############################################################### | ||||
|  | ||||
| if [ 16 -eq 1 ]; then | ||||
| 	export OMPI_MCA_io=ompio | ||||
| else | ||||
| 	export OMPI_MCA_io=romio321 | ||||
| fi | ||||
| export OMPI_MCA_btl_openib_allow_ib=true | ||||
| export OMPI_MCA_btl_openib_device_type=infiniband | ||||
| export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 | ||||
|  | ||||
| # load environment ############################################################# | ||||
| env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)" | ||||
| source "${env_dir}/env-base.sh" | ||||
| if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then | ||||
| 	source "${env_dir}/env-gpu.sh" | ||||
| else | ||||
| 	echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2 | ||||
|   exit 1 | ||||
| fi | ||||
| spack load sshpass | ||||
|  | ||||
| # application and parameters ################################################### | ||||
| app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32' | ||||
| opt=('--comms-overlap' '--comms-concurrent') | ||||
| par='' | ||||
|  | ||||
| # collect job information ###################################################### | ||||
| job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID} | ||||
| mkdir -p "${job_info_dir}" | ||||
|  | ||||
| date                         > "${job_info_dir}/start-date" | ||||
| echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date" | ||||
| set                          > "${job_info_dir}/env" | ||||
| ldd ${app}                   > "${job_info_dir}/ldd" | ||||
| md5sum ${app}                > "${job_info_dir}/app-hash" | ||||
| readelf -a ${app}            > "${job_info_dir}/elf" | ||||
| echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes" | ||||
| cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script" | ||||
| if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi | ||||
|  | ||||
| # GPU frequency control ######################################################## | ||||
| power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/' | ||||
| freq=1095 | ||||
|  | ||||
| # set frequency | ||||
| for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do  | ||||
| 	${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}" | ||||
| done | ||||
| # start NVIDIA SMI monitoring | ||||
| tmp=$(mktemp) | ||||
| sleep 1 | ||||
| coproc nvidia-smi dmon -o DT &> "${tmp}" | ||||
|  | ||||
| # run! ######################################################################### | ||||
| mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \ | ||||
| 	./gpu-mpi-wrapper.sh \ | ||||
|   ${app} "${par}" "${opt[@]}" \ | ||||
| 	--mpi 2.2.2.8 \ | ||||
|   --accelerator-threads 8 \ | ||||
| 	--grid 48.48.48.96 \ | ||||
| 	--shm 2048 &> "${job_info_dir}/log" | ||||
|  | ||||
| # if we reach that point the application exited successfully ################### | ||||
| touch "${job_info_dir}/success" | ||||
| date > "${job_info_dir}/end-date" | ||||
| echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date" | ||||
|  | ||||
| # reset GPUS ################################################################### | ||||
| # stop monitoring | ||||
| kill -INT "${COPROC_PID}" | ||||
|  | ||||
| # make monitoring DB | ||||
| ${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}" | ||||
|  | ||||
| # reset clocks | ||||
| for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do  | ||||
| 	${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'  | ||||
| done | ||||
| ################################################################################ | ||||
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 21:03:53 BST 2022 | ||||
| epoch 1661025833 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 21:12:23 BST 2022 | ||||
| epoch 1661026343 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffdef5db000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000152bce209000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000152bcde41000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000152bcd94f000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000152bcd625000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000152bcd344000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000152bcd0e3000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000152bce190000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000152bccd03000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x0000152bcb5a7000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000152bcb1d7000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000152bcaf36000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000152bcae0b000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x0000152bcaa89000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000152bca852000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000152bca63a000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x0000152bca41a000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x0000152bca055000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x0000152bc9e51000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x0000152bce059000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x0000152bc9c49000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000152bce0c4000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000152bce0bf000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000152bc9b3d000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000152bc9933000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x0000152bc972f000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x147320000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.574553 s : Grid Layout | ||||
| Grid : Message : 1.574555 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.574559 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.574561 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.590560 s : Making s innermost grids | ||||
| Grid : Message : 1.602336 s : Initialising 4d RNG | ||||
| Grid : Message : 1.619266 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.619291 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 1.883640 s : Initialising 5d RNG | ||||
| Grid : Message : 2.117383 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.117419 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.594282 s : Initialised RNGs | ||||
| Grid : Message : 8.809615 s : Drawing gauge field | ||||
| Grid : Message : 8.954788 s : Random gauge initialised  | ||||
| Grid : Message : 8.965668 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.965128 s : ***************************************************************** | ||||
| Grid : Message : 13.965152 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.965153 s : ***************************************************************** | ||||
| Grid : Message : 13.965154 s : ***************************************************************** | ||||
| Grid : Message : 13.965155 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.965156 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.965157 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.965159 s : * SINGLE precision  | ||||
| Grid : Message : 13.965160 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.965161 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.965162 s : ***************************************************************** | ||||
| Grid : Message : 14.515202 s : Called warmup | ||||
| Grid : Message : 99.730150 s : Called Dw 30000 times in 8.52149e+07 us | ||||
| Grid : Message : 99.730204 s : mflop/s =   7.89395e+07 | ||||
| Grid : Message : 99.730206 s : mflop/s per rank =  1.23343e+06 | ||||
| Grid : Message : 99.730208 s : mflop/s per node =  4.93372e+06 | ||||
| Grid : Message : 99.730210 s : RF  GiB/s (base 2) =   160403 | ||||
| Grid : Message : 99.730212 s : mem GiB/s (base 2) =   100252 | ||||
| Grid : Message : 99.730784 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 99.740621 s : #### Dhop calls report  | ||||
| Grid : Message : 99.740628 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 99.740631 s : WilsonFermion5D TotalTime   /Calls        : 1421.72 us | ||||
| Grid : Message : 99.740633 s : WilsonFermion5D CommTime    /Calls        : 984.801 us | ||||
| Grid : Message : 99.740635 s : WilsonFermion5D FaceTime    /Calls        : 215.72 us | ||||
| Grid : Message : 99.740637 s : WilsonFermion5D ComputeTime1/Calls        : 2.65594 us | ||||
| Grid : Message : 99.740639 s : WilsonFermion5D ComputeTime2/Calls        : 233.727 us | ||||
| Grid : Message : 99.740655 s : Average mflops/s per call                : 3.59268e+10 | ||||
| Grid : Message : 99.740658 s : Average mflops/s per call per rank       : 5.61356e+08 | ||||
| Grid : Message : 99.740660 s : Average mflops/s per call per node       : 2.24542e+09 | ||||
| Grid : Message : 99.740662 s : Average mflops/s per call (full)         : 8.02916e+07 | ||||
| Grid : Message : 99.740665 s : Average mflops/s per call per rank (full): 1.25456e+06 | ||||
| Grid : Message : 99.740667 s : Average mflops/s per call per node (full): 5.01823e+06 | ||||
| Grid : Message : 99.740669 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 99.740670 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 99.740672 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 99.740673 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 99.740675 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 99.740679 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 108.466783 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 108.466816 s : Called DwDag | ||||
| Grid : Message : 108.466817 s : norm dag result 12.0422 | ||||
| Grid : Message : 108.470193 s : norm dag ref    12.0422 | ||||
| Grid : Message : 108.473428 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 108.486838 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 108.550312 s : src_e0.5 | ||||
| Grid : Message : 108.623836 s : src_o0.5 | ||||
| Grid : Message : 108.640541 s : ********************************************************* | ||||
| Grid : Message : 108.640545 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 108.640546 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 108.640548 s : * SINGLE precision  | ||||
| Grid : Message : 108.640553 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 108.640555 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 108.640556 s : ********************************************************* | ||||
| Grid : Message : 154.233908 s : Deo mflop/s =   7.37872e+07 | ||||
| Grid : Message : 154.233941 s : Deo mflop/s per rank   1.15293e+06 | ||||
| Grid : Message : 154.233943 s : Deo mflop/s per node   4.6117e+06 | ||||
| Grid : Message : 154.233946 s : #### Dhop calls report  | ||||
| Grid : Message : 154.233948 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 154.233950 s : WilsonFermion5D TotalTime   /Calls        : 1519.59 us | ||||
| Grid : Message : 154.233952 s : WilsonFermion5D CommTime    /Calls        : 1019.64 us | ||||
| Grid : Message : 154.233954 s : WilsonFermion5D FaceTime    /Calls        : 288.201 us | ||||
| Grid : Message : 154.233956 s : WilsonFermion5D ComputeTime1/Calls        : 4.91837 us | ||||
| Grid : Message : 154.233958 s : WilsonFermion5D ComputeTime2/Calls        : 236.348 us | ||||
| Grid : Message : 154.233977 s : Average mflops/s per call                : 2.07539e+10 | ||||
| Grid : Message : 154.233980 s : Average mflops/s per call per rank       : 3.24279e+08 | ||||
| Grid : Message : 154.233982 s : Average mflops/s per call per node       : 1.29712e+09 | ||||
| Grid : Message : 154.233984 s : Average mflops/s per call (full)         : 7.51203e+07 | ||||
| Grid : Message : 154.233986 s : Average mflops/s per call per rank (full): 1.17375e+06 | ||||
| Grid : Message : 154.233988 s : Average mflops/s per call per node (full): 4.69502e+06 | ||||
| Grid : Message : 154.233991 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 154.233992 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 154.233993 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 154.233994 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 154.233995 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 154.233996 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 154.253979 s : r_e6.02106 | ||||
| Grid : Message : 154.255883 s : r_o6.0211 | ||||
| Grid : Message : 154.257289 s : res12.0422 | ||||
| Grid : Message : 154.364123 s : norm diff   0 | ||||
| Grid : Message : 154.496590 s : norm diff even  0 | ||||
| Grid : Message : 154.572879 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1110.64091/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| #!/usr/bin/env bash | ||||
| # shellcheck disable=SC1091,SC2050,SC2170 | ||||
|  | ||||
| # Slurm batch job: runs Benchmark_dwf_fp32 on 16 nodes (64 MPI ranks) after | ||||
| # setting the GPU application clocks to 1215,${freq} on every node, while | ||||
| # `nvidia-smi dmon` records monitoring samples for the duration of the run. | ||||
| # Job metadata and the application log go to job/<job name>.<job id>/. | ||||
| # | ||||
| # using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa | ||||
|  | ||||
| #SBATCH -J power-16A-1110 | ||||
| #SBATCH -A dp207 | ||||
| #SBATCH -t 48:00:00 | ||||
| #SBATCH --nodes=16 | ||||
| #SBATCH --ntasks=64 | ||||
| #SBATCH --ntasks-per-node=4 | ||||
| #SBATCH --cpus-per-task=8 | ||||
| #SBATCH --partition=gpu | ||||
| #SBATCH --gres=gpu:4 | ||||
| #SBATCH --output=%x.%j.out | ||||
| #SBATCH --error=%x.%j.err | ||||
| #SBATCH --reservation=dc-port1_61 | ||||
| #SBATCH --qos=reservation | ||||
| #SBATCH --no-requeue | ||||
|  | ||||
| # Abort on the first failing command; GPU state is restored by the EXIT trap | ||||
| # installed in the "GPU frequency control" section. | ||||
| set -e | ||||
|  | ||||
| # OpenMP/OpenMPI/UCX environment ############################################### | ||||
| export OMP_NUM_THREADS=4 | ||||
| export OMPI_MCA_btl=^uct,openib | ||||
| export OMPI_MCA_pml=ucx | ||||
| export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc | ||||
| export UCX_RNDV_SCHEME=put_zcopy | ||||
| export UCX_RNDV_THRESH=16384 | ||||
| export UCX_IB_GPU_DIRECT_RDMA=yes | ||||
| export UCX_MEMTYPE_CACHE=n | ||||
|  | ||||
| # IO environment ############################################################### | ||||
|  | ||||
| # Both operands are literals (hence SC2050 is disabled above). The error | ||||
| # message below calls this file a template, so the 16 is presumably the | ||||
| # substituted node count; as written, the romio321 branch is always taken. | ||||
| if [ 16 -eq 1 ]; then | ||||
|   export OMPI_MCA_io=ompio | ||||
| else | ||||
|   export OMPI_MCA_io=romio321 | ||||
| fi | ||||
| export OMPI_MCA_btl_openib_allow_ib=true | ||||
| export OMPI_MCA_btl_openib_device_type=infiniband | ||||
| export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 | ||||
|  | ||||
| # load environment ############################################################# | ||||
| env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)" | ||||
| source "${env_dir}/env-base.sh" | ||||
| if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then | ||||
|   source "${env_dir}/env-gpu.sh" | ||||
| else | ||||
|   echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2 | ||||
|   exit 1 | ||||
| fi | ||||
| spack load sshpass | ||||
|  | ||||
| # application and parameters ################################################### | ||||
| app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32' | ||||
| opt=('--comms-overlap' '--comms-concurrent') | ||||
| par='' | ||||
|  | ||||
| # Application arguments: the parameter file is included only when one is | ||||
| # configured, so an empty ${par} does not turn into an empty argv entry. | ||||
| app_args=() | ||||
| if [ -n "${par}" ]; then | ||||
|   app_args+=("${par}") | ||||
| fi | ||||
| app_args+=("${opt[@]}") | ||||
|  | ||||
| # collect job information ###################################################### | ||||
| job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID} | ||||
| mkdir -p "${job_info_dir}" | ||||
|  | ||||
| date                         > "${job_info_dir}/start-date" | ||||
| echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date" | ||||
| set                          > "${job_info_dir}/env" | ||||
| ldd "${app}"                 > "${job_info_dir}/ldd" | ||||
| md5sum "${app}"              > "${job_info_dir}/app-hash" | ||||
| readelf -a "${app}"          > "${job_info_dir}/elf" | ||||
| echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes" | ||||
| cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script" | ||||
| if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi | ||||
|  | ||||
| # GPU frequency control ######################################################## | ||||
| power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/' | ||||
| freq=1110 | ||||
|  | ||||
| # Expand the Slurm node list once; it is used both to set and to reset clocks. | ||||
| mapfile -t hosts < <(scontrol show hostnames "${SLURM_JOB_NODELIST}") | ||||
|  | ||||
| # PID of the nvidia-smi monitor; empty while no monitor is running. | ||||
| dmon_pid='' | ||||
|  | ||||
| ####################################### | ||||
| # EXIT handler: stop the monitor if it is still running and set the | ||||
| # application clocks back to 1215,1410 on every node. It runs on every exit | ||||
| # path, so a failure under `set -e` does not leave the nodes at ${freq}. | ||||
| # Globals:   dmon_pid, hosts, power_dir (read) | ||||
| # Arguments: none | ||||
| # Returns:   exits with the script's original status, or 1 when the script | ||||
| #            had succeeded but at least one clock reset failed | ||||
| ####################################### | ||||
| restore_gpus() { | ||||
|   local status=$? | ||||
|   local h | ||||
|   local reset_failed=0 | ||||
|   trap - EXIT | ||||
|   # Best effort from here on: one failing node must not skip the others. | ||||
|   set +e | ||||
|   if [ -n "${dmon_pid}" ]; then | ||||
|     kill -INT "${dmon_pid}" 2> /dev/null | ||||
|   fi | ||||
|   for h in "${hosts[@]}"; do | ||||
|     "${power_dir}/remote-sudo.sh" "$h" 'nvidia-smi -ac 1215,1410' || reset_failed=1 | ||||
|   done | ||||
|   if [ "${status}" -eq 0 ] && [ "${reset_failed}" -ne 0 ]; then | ||||
|     status=1 | ||||
|   fi | ||||
|   exit "${status}" | ||||
| } | ||||
| trap restore_gpus EXIT | ||||
|  | ||||
| # set frequency | ||||
| for h in "${hosts[@]}"; do | ||||
|   "${power_dir}/remote-sudo.sh" "$h" "nvidia-smi -ac 1215,${freq}" | ||||
| done | ||||
| # start NVIDIA SMI monitoring | ||||
| tmp=$(mktemp) | ||||
| sleep 1 | ||||
| coproc nvidia-smi dmon -o DT &> "${tmp}" | ||||
| # Keep our own copy of the PID: bash unsets COPROC_PID once the coprocess has | ||||
| # been reaped, which would otherwise make the later kill fail. | ||||
| dmon_pid=${COPROC_PID:-} | ||||
|  | ||||
| # run! ######################################################################### | ||||
| mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \ | ||||
|   ./gpu-mpi-wrapper.sh \ | ||||
|   "${app}" "${app_args[@]}" \ | ||||
|   --mpi 2.2.2.8 \ | ||||
|   --accelerator-threads 8 \ | ||||
|   --grid 48.48.48.96 \ | ||||
|   --shm 2048 &> "${job_info_dir}/log" | ||||
|  | ||||
| # if we reach that point the application exited successfully ################### | ||||
| touch "${job_info_dir}/success" | ||||
| date > "${job_info_dir}/end-date" | ||||
| echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date" | ||||
|  | ||||
| # reset GPUS ################################################################### | ||||
| # stop monitoring | ||||
| if [ -n "${dmon_pid}" ]; then | ||||
|   kill -INT "${dmon_pid}" \ | ||||
|     || echo "warning: nvidia-smi dmon (pid ${dmon_pid}) was no longer running" 1>&2 | ||||
|   dmon_pid='' | ||||
| fi | ||||
|  | ||||
| # make monitoring DB | ||||
| # NOTE(review): ${tmp} is left in place afterwards, as before; remove it here | ||||
| # once it is confirmed that dmon-to-db.sh has finished reading it on return. | ||||
| "${power_dir}/dmon-to-db.sh" "${tmp}" smi-dmon-16A.db "clock_limit_${freq}" | ||||
|  | ||||
| # reset clocks: done by restore_gpus (EXIT trap) when the script exits | ||||
| ################################################################################ | ||||
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 21:09:39 BST 2022 | ||||
| epoch 1661026179 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 21:18:10 BST 2022 | ||||
| epoch 1661026690 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffe04b26000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ffbc78a000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ffbc3c2000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ffbbed0000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ffbbba6000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ffbb8c5000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ffbb664000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ffbc711000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ffbb284000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ffb9b28000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ffb9758000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ffb94b7000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ffb938c000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x000014ffb900a000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ffb8dd3000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ffb8bbb000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ffb899b000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x000014ffb85d6000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x000014ffb83d2000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x000014ffbc5da000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x000014ffb81ca000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ffbc645000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ffbc640000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ffb80be000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ffb7eb4000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x000014ffb7cb0000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146500000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.503072 s : Grid Layout | ||||
| Grid : Message : 1.503076 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.503081 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.503083 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.518479 s : Making s innermost grids | ||||
| Grid : Message : 1.535611 s : Initialising 4d RNG | ||||
| Grid : Message : 1.551229 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.551252 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 1.805667 s : Initialising 5d RNG | ||||
| Grid : Message : 2.356490 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.357030 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.303785 s : Initialised RNGs | ||||
| Grid : Message : 8.385261 s : Drawing gauge field | ||||
| Grid : Message : 8.496485 s : Random gauge initialised  | ||||
| Grid : Message : 8.509783 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.609539 s : ***************************************************************** | ||||
| Grid : Message : 13.609564 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.609566 s : ***************************************************************** | ||||
| Grid : Message : 13.609568 s : ***************************************************************** | ||||
| Grid : Message : 13.609573 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.609575 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.609577 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.609579 s : * SINGLE precision  | ||||
| Grid : Message : 13.609582 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.609584 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.609586 s : ***************************************************************** | ||||
| Grid : Message : 14.155991 s : Called warmup | ||||
| Grid : Message : 98.420612 s : Called Dw 30000 times in 8.42644e+07 us | ||||
| Grid : Message : 98.420675 s : mflop/s =   7.983e+07 | ||||
| Grid : Message : 98.420677 s : mflop/s per rank =  1.24734e+06 | ||||
| Grid : Message : 98.420679 s : mflop/s per node =  4.98937e+06 | ||||
| Grid : Message : 98.420681 s : RF  GiB/s (base 2) =   162213 | ||||
| Grid : Message : 98.420683 s : mem GiB/s (base 2) =   101383 | ||||
| Grid : Message : 98.421254 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 98.431170 s : #### Dhop calls report  | ||||
| Grid : Message : 98.431178 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 98.431182 s : WilsonFermion5D TotalTime   /Calls        : 1405.63 us | ||||
| Grid : Message : 98.431184 s : WilsonFermion5D CommTime    /Calls        : 961.451 us | ||||
| Grid : Message : 98.431186 s : WilsonFermion5D FaceTime    /Calls        : 222.433 us | ||||
| Grid : Message : 98.431188 s : WilsonFermion5D ComputeTime1/Calls        : 2.80214 us | ||||
| Grid : Message : 98.431190 s : WilsonFermion5D ComputeTime2/Calls        : 234.1 us | ||||
| Grid : Message : 98.431212 s : Average mflops/s per call                : 3.60793e+10 | ||||
| Grid : Message : 98.431216 s : Average mflops/s per call per rank       : 5.63738e+08 | ||||
| Grid : Message : 98.431218 s : Average mflops/s per call per node       : 2.25495e+09 | ||||
| Grid : Message : 98.431220 s : Average mflops/s per call (full)         : 8.12107e+07 | ||||
| Grid : Message : 98.431224 s : Average mflops/s per call per rank (full): 1.26892e+06 | ||||
| Grid : Message : 98.431226 s : Average mflops/s per call per node (full): 5.07567e+06 | ||||
| Grid : Message : 98.431229 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 98.431230 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 98.431235 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 98.431239 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 98.431240 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 98.431241 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 107.161203 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 107.161230 s : Called DwDag | ||||
| Grid : Message : 107.161231 s : norm dag result 12.0422 | ||||
| Grid : Message : 107.163717 s : norm dag ref    12.0422 | ||||
| Grid : Message : 107.166717 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 107.181064 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 107.248613 s : src_e0.5 | ||||
| Grid : Message : 107.314227 s : src_o0.5 | ||||
| Grid : Message : 107.331787 s : ********************************************************* | ||||
| Grid : Message : 107.331790 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 107.331792 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 107.331794 s : * SINGLE precision  | ||||
| Grid : Message : 107.331795 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 107.331796 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 107.331797 s : ********************************************************* | ||||
| Grid : Message : 152.337360 s : Deo mflop/s =   7.47496e+07 | ||||
| Grid : Message : 152.337387 s : Deo mflop/s per rank   1.16796e+06 | ||||
| Grid : Message : 152.337390 s : Deo mflop/s per node   4.67185e+06 | ||||
| Grid : Message : 152.337396 s : #### Dhop calls report  | ||||
| Grid : Message : 152.337399 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 152.337402 s : WilsonFermion5D TotalTime   /Calls        : 1500 us | ||||
| Grid : Message : 152.337405 s : WilsonFermion5D CommTime    /Calls        : 1002.91 us | ||||
| Grid : Message : 152.337408 s : WilsonFermion5D FaceTime    /Calls        : 282.963 us | ||||
| Grid : Message : 152.337410 s : WilsonFermion5D ComputeTime1/Calls        : 4.71911 us | ||||
| Grid : Message : 152.337412 s : WilsonFermion5D ComputeTime2/Calls        : 237.647 us | ||||
| Grid : Message : 152.337435 s : Average mflops/s per call                : 2.07759e+10 | ||||
| Grid : Message : 152.337439 s : Average mflops/s per call per rank       : 3.24624e+08 | ||||
| Grid : Message : 152.337441 s : Average mflops/s per call per node       : 1.29849e+09 | ||||
| Grid : Message : 152.337445 s : Average mflops/s per call (full)         : 7.61013e+07 | ||||
| Grid : Message : 152.337448 s : Average mflops/s per call per rank (full): 1.18908e+06 | ||||
| Grid : Message : 152.337451 s : Average mflops/s per call per node (full): 4.75633e+06 | ||||
| Grid : Message : 152.337453 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 152.337456 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 152.337457 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 152.337459 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 152.337462 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 152.337463 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 152.358219 s : r_e6.02106 | ||||
| Grid : Message : 152.359968 s : r_o6.0211 | ||||
| Grid : Message : 152.361373 s : res12.0422 | ||||
| Grid : Message : 152.467780 s : norm diff   0 | ||||
| Grid : Message : 152.609427 s : norm diff even  0 | ||||
| Grid : Message : 152.675745 s : norm diff odd   0 | ||||
							
								
								
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/nodes
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/nodes
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21] | ||||
							
								
								
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/script
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										112
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1125.64095/script
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| #!/usr/bin/env bash | ||||
| # shellcheck disable=SC1091,SC2050,SC2170 | ||||
|  | ||||
| # Slurm batch job 'power-16A-1125': 16 nodes, 64 tasks (4 per node), 4 GPUs per | ||||
| # node on the gpu partition. | ||||
| # using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa | ||||
|  | ||||
| #SBATCH -J power-16A-1125 | ||||
| #SBATCH -A dp207 | ||||
| #SBATCH -t 48:00:00 | ||||
| #SBATCH --nodes=16 | ||||
| #SBATCH --ntasks=64 | ||||
| #SBATCH --ntasks-per-node=4 | ||||
| #SBATCH --cpus-per-task=8 | ||||
| #SBATCH --partition=gpu | ||||
| #SBATCH --gres=gpu:4 | ||||
| #SBATCH --output=%x.%j.out | ||||
| #SBATCH --error=%x.%j.err | ||||
| #SBATCH --reservation=dc-port1_61 | ||||
| #SBATCH --qos=reservation | ||||
| #SBATCH --no-requeue | ||||
|  | ||||
| # abort the job script as soon as any command fails | ||||
| set -e | ||||
|  | ||||
| # OpenMP/OpenMPI/UCX environment ############################################### | ||||
| # threads per MPI task | ||||
| export OMP_NUM_THREADS=4 | ||||
| # OpenMPI: UCX point-to-point layer, uct and openib BTLs excluded | ||||
| export OMPI_MCA_btl='^uct,openib' \ | ||||
| 	OMPI_MCA_pml='ucx' | ||||
| # UCX transports and rendezvous settings | ||||
| export UCX_TLS='gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc' \ | ||||
| 	UCX_RNDV_SCHEME='put_zcopy' \ | ||||
| 	UCX_RNDV_THRESH='16384' \ | ||||
| 	UCX_IB_GPU_DIRECT_RDMA='yes' \ | ||||
| 	UCX_MEMTYPE_CACHE='n' | ||||
|  | ||||
| # IO environment ############################################################### | ||||
|  | ||||
| # MPI-IO component: romio321 unless the constant below equals 1, ompio otherwise. | ||||
| # NOTE(review): the literal 16 matches --nodes=16 and is presumably filled in by | ||||
| # a job template — confirm against the template. | ||||
| if [ 16 -ne 1 ]; then | ||||
| 	export OMPI_MCA_io=romio321 | ||||
| else | ||||
| 	export OMPI_MCA_io=ompio | ||||
| fi | ||||
| export OMPI_MCA_btl_openib_allow_ib='true' \ | ||||
| 	OMPI_MCA_btl_openib_device_type='infiniband' \ | ||||
| 	OMPI_MCA_btl_openib_if_exclude='mlx5_1,mlx5_2,mlx5_3' | ||||
|  | ||||
| # load environment ############################################################# | ||||
| # resolve the versioned environment directory, then source the base setup | ||||
| env_dir=$(readlink -f '/mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428') | ||||
| source "${env_dir}/env-base.sh" | ||||
| # only the gpu partition is handled; anything else is rejected | ||||
| if [ "${SLURM_JOB_PARTITION}" != 'gpu' ]; then | ||||
| 	echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2 | ||||
| 	exit 1 | ||||
| fi | ||||
| source "${env_dir}/env-gpu.sh" | ||||
| spack load sshpass | ||||
|  | ||||
| # application and parameters ################################################### | ||||
| # app: benchmark binary to launch | ||||
| # opt: extra command-line flags handed to the benchmark | ||||
| # par: optional file passed to the benchmark and archived below (empty here) | ||||
| app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32' | ||||
| opt=('--comms-overlap' '--comms-concurrent') | ||||
| par='' | ||||
|  | ||||
| # collect job information ###################################################### | ||||
| # Everything below is written to job/<job name>.<job id>/, relative to the | ||||
| # directory the script runs in. | ||||
| job_info_dir="job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}" | ||||
| mkdir -p "${job_info_dir}" | ||||
|  | ||||
| # start time, human readable and as epoch seconds | ||||
| date                         > "${job_info_dir}/start-date" | ||||
| echo "epoch $(date '+%s')"   >> "${job_info_dir}/start-date" | ||||
| # dump of all shell variables and functions at this point | ||||
| set                          > "${job_info_dir}/env" | ||||
| # shared libraries, checksum and ELF layout of the binary; the path is quoted | ||||
| # so that it is always passed as a single argument | ||||
| ldd "${app}"                 > "${job_info_dir}/ldd" | ||||
| md5sum "${app}"              > "${job_info_dir}/app-hash" | ||||
| readelf -a "${app}"          > "${job_info_dir}/elf" | ||||
| echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes" | ||||
| # keep a copy of this script, and of the parameter file when one is set | ||||
| cp "${BASH_SOURCE[0]}"       "${job_info_dir}/script" | ||||
| if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi | ||||
|  | ||||
| # GPU frequency control ######################################################## | ||||
| power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/' | ||||
| freq=1125 | ||||
| tmp=''          # temporary file receiving the nvidia-smi dmon output | ||||
| monitor_pid=''  # PID of the running monitor, empty when none is running | ||||
|  | ||||
| # Undo the side effects of this section: stop a monitor that is still running, | ||||
| # set the application clocks back to 1215,1410 on every node of the job and | ||||
| # delete the temporary dmon log. Installed as an EXIT trap because `set -e` | ||||
| # aborts the script on the first failing command, which would otherwise skip | ||||
| # the clock reset and leave the nodes at the reduced frequency. | ||||
| # The script's exit status is preserved; a failed reset turns a 0 into 1. | ||||
| cleanup() { | ||||
| 	local status=$? | ||||
| 	local h | ||||
| 	# best effort from here on: one failing node must not stop the other resets | ||||
| 	set +e | ||||
| 	if [ -n "${monitor_pid}" ]; then kill -INT "${monitor_pid}"; fi | ||||
| 	for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do | ||||
| 		if ! "${power_dir}/remote-sudo.sh" "$h" 'nvidia-smi -ac 1215,1410'; then | ||||
| 			echo "error: could not reset GPU clocks on $h" 1>&2 | ||||
| 			if [ "${status}" -eq 0 ]; then status=1; fi | ||||
| 		fi | ||||
| 	done | ||||
| 	if [ -n "${tmp}" ]; then rm -f -- "${tmp}"; fi | ||||
| 	exit "${status}" | ||||
| } | ||||
| # register before touching any clock so that a partial failure is undone too | ||||
| trap cleanup EXIT | ||||
|  | ||||
| # set frequency | ||||
| for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do | ||||
| 	"${power_dir}/remote-sudo.sh" "$h" "nvidia-smi -ac 1215,${freq}" | ||||
| done | ||||
| # start NVIDIA SMI monitoring | ||||
| tmp=$(mktemp) | ||||
| sleep 1 | ||||
| coproc nvidia-smi dmon -o DT &> "${tmp}" | ||||
| # remember the PID in a variable of our own for the cleanup trap | ||||
| monitor_pid=${COPROC_PID:-} | ||||
|  | ||||
| # run! ######################################################################### | ||||
| # ${par:+...} expands to nothing when par is empty, so the application does not | ||||
| # receive an empty argument | ||||
| mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \ | ||||
| 	./gpu-mpi-wrapper.sh \ | ||||
| 	"${app}" ${par:+"${par}"} "${opt[@]}" \ | ||||
| 	--mpi 2.2.2.8 \ | ||||
| 	--accelerator-threads 8 \ | ||||
| 	--grid 48.48.48.96 \ | ||||
| 	--shm 2048 &> "${job_info_dir}/log" | ||||
|  | ||||
| # if we reach that point the application exited successfully ################### | ||||
| touch "${job_info_dir}/success" | ||||
| date > "${job_info_dir}/end-date" | ||||
| echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date" | ||||
|  | ||||
| # reset GPUS ################################################################### | ||||
| # stop monitoring and wait for the monitor to exit before reading its log; | ||||
| # the wait status reflects the interrupt and is deliberately ignored | ||||
| kill -INT "${monitor_pid}" | ||||
| wait "${monitor_pid}" || true | ||||
| monitor_pid='' | ||||
|  | ||||
| # make monitoring DB | ||||
| "${power_dir}/dmon-to-db.sh" "${tmp}" smi-dmon-16A.db "clock_limit_${freq}" | ||||
|  | ||||
| # reset clocks: performed by the cleanup EXIT trap when the script ends | ||||
| ################################################################################ | ||||
| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 21:15:27 BST 2022 | ||||
| epoch 1661026527 | ||||
| @@ -0,0 +1 @@ | ||||
| 6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32 | ||||
							
								
								
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1140.64100/elf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4310
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1140.64100/elf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,2 @@ | ||||
| Sat Aug 20 21:23:53 BST 2022 | ||||
| epoch 1661027033 | ||||
							
								
								
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1140.64100/env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2062
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1140.64100/env
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1140.64100/ldd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1140.64100/ldd
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| 	linux-vdso.so.1 (0x00007ffebcf65000) | ||||
| 	libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014c5058a0000) | ||||
| 	libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014c5054d8000) | ||||
| 	libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014c504fe6000) | ||||
| 	libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014c504cbc000) | ||||
| 	libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014c5049db000) | ||||
| 	libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014c50477a000) | ||||
| 	libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014c505827000) | ||||
| 	libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014c50439a000) | ||||
| 	libcuda.so.1 => /lib64/libcuda.so.1 (0x000014c502c3e000) | ||||
| 	libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014c50286e000) | ||||
| 	libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014c5025cd000) | ||||
| 	libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014c5024a2000) | ||||
| 	libm.so.6 => /lib64/libm.so.6 (0x000014c502120000) | ||||
| 	libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014c501ee9000) | ||||
| 	libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014c501cd1000) | ||||
| 	libpthread.so.0 => /lib64/libpthread.so.0 (0x000014c501ab1000) | ||||
| 	libc.so.6 => /lib64/libc.so.6 (0x000014c5016ec000) | ||||
| 	libdl.so.2 => /lib64/libdl.so.2 (0x000014c5014e8000) | ||||
| 	/lib64/ld-linux-x86-64.so.2 (0x000014c5056f0000) | ||||
| 	librt.so.1 => /lib64/librt.so.1 (0x000014c5012e0000) | ||||
| 	libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014c50575b000) | ||||
| 	libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014c505756000) | ||||
| 	libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014c5011d4000) | ||||
| 	libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014c500fca000) | ||||
| 	libutil.so.1 => /lib64/libutil.so.1 (0x000014c500dc6000) | ||||
							
								
								
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1140.64100/log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								2-racks/size-C0/16-nodes/job/power-16A-1140.64100/log
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | ||||
| tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 | ||||
| tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 | ||||
| tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 | ||||
| tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 | ||||
| tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device Number    : 0 | ||||
| AcceleratorCudaInit[0]: ======================== | ||||
| AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB | ||||
| AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344  | ||||
| AcceleratorCudaInit[0]:   managedMemory: 1  | ||||
| AcceleratorCudaInit[0]:   isMultiGpuBoard: 0  | ||||
| AcceleratorCudaInit[0]:   warpSize: 32  | ||||
| AcceleratorCudaInit[0]:   pciBusID: 3  | ||||
| AcceleratorCudaInit[0]:   pciDeviceID: 0  | ||||
| AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| OPENMPI detected | ||||
| AcceleratorCudaInit: using default device  | ||||
| AcceleratorCudaInit: assume user either uses | ||||
| AcceleratorCudaInit: a) IBM jsrun, or  | ||||
| AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding  | ||||
| AcceleratorCudaInit: Configure options --enable-setdevice=no  | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 2 device 0 bus id: 0000:84:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 0 device 0 bus id: 0000:03:00.0 | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| AcceleratorCudaInit: ================================================ | ||||
| local rank 1 device 0 bus id: 0000:44:00.0 | ||||
| local rank 3 device 0 bus id: 0000:C4:00.0 | ||||
| SharedMemoryMpi:  World communicator of size 64 | ||||
| SharedMemoryMpi:  Node  communicator of size 4 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x151740000000 for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : Requested 2147483648 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 34004218675 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using cudaMalloc | ||||
| Grid : Message : 1.505233 s : Grid Layout | ||||
| Grid : Message : 1.505236 s : 	Global lattice size  : 48 48 48 96  | ||||
| Grid : Message : 1.505241 s : 	OpenMP threads       : 4 | ||||
| Grid : Message : 1.505243 s : 	MPI tasks            : 2 2 2 8  | ||||
| Grid : Message : 1.518777 s : Making s innermost grids | ||||
| Grid : Message : 1.535643 s : Initialising 4d RNG | ||||
| Grid : Message : 1.552849 s : Intialising parallel RNG with unique string 'The 4D RNG' | ||||
| Grid : Message : 1.552877 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 | ||||
| Grid : Message : 1.845937 s : Initialising 5d RNG | ||||
| Grid : Message : 2.760090 s : Intialising parallel RNG with unique string 'The 5D RNG' | ||||
| Grid : Message : 2.760740 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a | ||||
| Grid : Message : 7.286341 s : Initialised RNGs | ||||
| Grid : Message : 8.532165 s : Drawing gauge field | ||||
| Grid : Message : 8.668228 s : Random gauge initialised  | ||||
| Grid : Message : 8.682313 s : Setting up Cshift based reference  | ||||
| Grid : Message : 13.795201 s : ***************************************************************** | ||||
| Grid : Message : 13.795223 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm | ||||
| Grid : Message : 13.795225 s : ***************************************************************** | ||||
| Grid : Message : 13.795226 s : ***************************************************************** | ||||
| Grid : Message : 13.795227 s : * Benchmarking DomainWallFermionR::Dhop                   | ||||
| Grid : Message : 13.795228 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 13.795229 s : * VComplexF size is 64 B | ||||
| Grid : Message : 13.795230 s : * SINGLE precision  | ||||
| Grid : Message : 13.795231 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 13.795232 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 13.795233 s : ***************************************************************** | ||||
| Grid : Message : 14.341937 s : Called warmup | ||||
| Grid : Message : 98.152427 s : Called Dw 30000 times in 8.38102e+07 us | ||||
| Grid : Message : 98.152504 s : mflop/s =   8.02626e+07 | ||||
| Grid : Message : 98.152506 s : mflop/s per rank =  1.2541e+06 | ||||
| Grid : Message : 98.152508 s : mflop/s per node =  5.01641e+06 | ||||
| Grid : Message : 98.152510 s : RF  GiB/s (base 2) =   163092 | ||||
| Grid : Message : 98.152512 s : mem GiB/s (base 2) =   101932 | ||||
| Grid : Message : 98.153085 s : norm diff   1.05775e-13 | ||||
| Grid : Message : 98.162893 s : #### Dhop calls report  | ||||
| Grid : Message : 98.162901 s : WilsonFermion5D Number of DhopEO Calls   : 60002 | ||||
| Grid : Message : 98.162908 s : WilsonFermion5D TotalTime   /Calls        : 1397.97 us | ||||
| Grid : Message : 98.162910 s : WilsonFermion5D CommTime    /Calls        : 954.568 us | ||||
| Grid : Message : 98.162912 s : WilsonFermion5D FaceTime    /Calls        : 222.6 us | ||||
| Grid : Message : 98.162914 s : WilsonFermion5D ComputeTime1/Calls        : 2.85327 us | ||||
| Grid : Message : 98.162916 s : WilsonFermion5D ComputeTime2/Calls        : 233.554 us | ||||
| Grid : Message : 98.162999 s : Average mflops/s per call                : 3.60928e+10 | ||||
| Grid : Message : 98.163003 s : Average mflops/s per call per rank       : 5.63949e+08 | ||||
| Grid : Message : 98.163005 s : Average mflops/s per call per node       : 2.2558e+09 | ||||
| Grid : Message : 98.163007 s : Average mflops/s per call (full)         : 8.16558e+07 | ||||
| Grid : Message : 98.163009 s : Average mflops/s per call per rank (full): 1.27587e+06 | ||||
| Grid : Message : 98.163011 s : Average mflops/s per call per node (full): 5.10348e+06 | ||||
| Grid : Message : 98.163013 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 98.163014 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 98.163015 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 98.163016 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 98.163017 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 98.163018 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 106.895323 s : Compare to naive wilson implementation Dag to verify correctness | ||||
| Grid : Message : 106.895349 s : Called DwDag | ||||
| Grid : Message : 106.895350 s : norm dag result 12.0422 | ||||
| Grid : Message : 106.897900 s : norm dag ref    12.0422 | ||||
| Grid : Message : 106.900879 s : norm dag diff   7.28899e-14 | ||||
| Grid : Message : 106.910611 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec | ||||
| Grid : Message : 106.985274 s : src_e0.5 | ||||
| Grid : Message : 107.454790 s : src_o0.5 | ||||
| Grid : Message : 107.617760 s : ********************************************************* | ||||
| Grid : Message : 107.617810 s : * Benchmarking DomainWallFermionF::DhopEO                 | ||||
| Grid : Message : 107.617840 s : * Vectorising space-time by 8 | ||||
| Grid : Message : 107.617860 s : * SINGLE precision  | ||||
| Grid : Message : 107.617880 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 107.617970 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 107.618000 s : ********************************************************* | ||||
| Grid : Message : 151.618746 s : Deo mflop/s =   7.55023e+07 | ||||
| Grid : Message : 151.618781 s : Deo mflop/s per rank   1.17972e+06 | ||||
| Grid : Message : 151.618784 s : Deo mflop/s per node   4.71889e+06 | ||||
| Grid : Message : 151.618787 s : #### Dhop calls report  | ||||
| Grid : Message : 151.618789 s : WilsonFermion5D Number of DhopEO Calls   : 30001 | ||||
| Grid : Message : 151.618791 s : WilsonFermion5D TotalTime   /Calls        : 1485.05 us | ||||
| Grid : Message : 151.618793 s : WilsonFermion5D CommTime    /Calls        : 986.29 us | ||||
| Grid : Message : 151.618795 s : WilsonFermion5D FaceTime    /Calls        : 286.831 us | ||||
| Grid : Message : 151.618797 s : WilsonFermion5D ComputeTime1/Calls        : 4.88924 us | ||||
| Grid : Message : 151.618799 s : WilsonFermion5D ComputeTime2/Calls        : 235.928 us | ||||
| Grid : Message : 151.618820 s : Average mflops/s per call                : 2.07643e+10 | ||||
| Grid : Message : 151.618823 s : Average mflops/s per call per rank       : 3.24442e+08 | ||||
| Grid : Message : 151.618825 s : Average mflops/s per call per node       : 1.29777e+09 | ||||
| Grid : Message : 151.618827 s : Average mflops/s per call (full)         : 7.68676e+07 | ||||
| Grid : Message : 151.618829 s : Average mflops/s per call per rank (full): 1.20106e+06 | ||||
| Grid : Message : 151.618831 s : Average mflops/s per call per node (full): 4.80423e+06 | ||||
| Grid : Message : 151.618834 s : WilsonFermion5D Stencil | ||||
| Grid : Message : 151.618835 s : WilsonFermion5D StencilEven | ||||
| Grid : Message : 151.618837 s : WilsonFermion5D StencilOdd | ||||
| Grid : Message : 151.618840 s : WilsonFermion5D Stencil     Reporti() | ||||
| Grid : Message : 151.618841 s : WilsonFermion5D StencilEven Reporti() | ||||
| Grid : Message : 151.618842 s : WilsonFermion5D StencilOdd  Reporti() | ||||
| Grid : Message : 151.638271 s : r_e6.02106 | ||||
| Grid : Message : 151.639942 s : r_o6.0211 | ||||
| Grid : Message : 151.641290 s : res12.0422 | ||||
| Grid : Message : 151.757693 s : norm diff   0 | ||||
| Grid : Message : 151.882958 s : norm diff even  0 | ||||
| Grid : Message : 151.953171 s : norm diff odd   0 | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user