SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512)

2026-07-18 08:03:27 +01:00 · 2023-12-04 15:45:57 -05:00
parent b77a9b8947
commit 14643c0aab
6 changed files with 92 additions and 4 deletions
@@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
  GaugeField Umu(UGrid);
  GaugeField UmuCopy(UGrid);
  SU<Nc>::HotConfiguration(RNG4,Umu);
  //  SU<Nc>::ColdConfiguration(Umu);
  UmuCopy=Umu;
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
@@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet)
    if(( n2e>1.0e-4) ) {
      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
      FGrid->Barrier();
      std::cout<<GridLogMessage << "RESULT" << std::endl;
      //      std::cout << result<<std::endl;
      std::cout << norm2(result)<<std::endl;
      std::cout<<GridLogMessage << "REF" << std::endl;
      std::cout << norm2(ref)<<std::endl;
      std::cout<<GridLogMessage << "ERR" << std::endl;
      std::cout << norm2(err)<<std::endl;
      FGrid->Barrier();
      exit(-1);
    }
    assert (n2e< 1.0e-4 );
@@ -0,0 +1,42 @@
 #!/bin/bash
 #SBATCH --partition csi
 #SBATCH --time=00:10:00
 #SBATCH -A csigeneral
 #SBATCH --exclusive
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
 #SBATCH --qos csi
 #SBATCH --gres=gpu:4
 # Slurm batch job for a single node with 4 tasks and 4 GPUs (see the #SBATCH
 # directives above). It writes the node topology with lstopo and then runs
 # Benchmark_dwf_fp32 on 4 ranks, each launched through the ./select_gpu wrapper.
 source sourceme.sh
 # Generate the per-rank wrapper. The heredoc delimiter is unquoted, so every
 # '$' that must survive into the generated file is escaped as '\$'.
 # The wrapper maps SLURM_LOCALID to a GPU index, exposes only that GPU through
 # CUDA_VISIBLE_DEVICES, and then execs the wrapped command.
 # "\$@" (rather than \$*) forwards the arguments without re-splitting them.
 cat << EOF > select_gpu
 #!/bin/bash
 export GPU_MAP=(0 1 2 3)
 export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
 export CUDA_VISIBLE_DEVICES=\$GPU
 unset ROCR_VISIBLE_DEVICES
 echo RANK \$SLURM_LOCALID using GPU \$GPU
 exec "\$@"
 EOF
 chmod +x ./select_gpu
 # OpenMP threads per rank.
 # NOTE(review): this script previously also exported the misspelled
 # OMP_NUM_THREAD=8, which OpenMP does not read; 4 was and remains the value in
 # effect. Change the number here if 8 threads per rank was actually intended.
 export OMP_NUM_THREADS=4
 # Open MPI / UCX transport settings for this node type.
 export OMPI_MCA_btl=^uct,openib
 export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
 export UCX_RNDV_SCHEME=put_zcopy
 export UCX_RNDV_THRESH=16384
 export UCX_IB_GPU_DIRECT_RDMA=no
 export UCX_MEMTYPE_CACHE=n
 #srun -N1 -n1 nvidia-smi
 #srun -N1 -n1 numactl -H > numa.txt
 # Record the hardware topology of the allocated node.
 srun -N1 -n1 lstopo A100-topo.pdf
 # 4.35 TF/s
 #srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0  --accelerator-threads 16
 # 4-rank run: 1.1.2.2 MPI decomposition of a 32.32.64.64 grid, one GPU per rank.
 srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0  --accelerator-threads 16
@@ -5,7 +5,7 @@
 --enable-accelerator=cuda \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
--enable-accelerator-cshift \
+--disable-accelerator-cshift \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=nvcc \
@@ -0,0 +1,31 @@
 #!/bin/bash
 #SBATCH --partition lqcd
 #SBATCH --time=00:20:00
 #SBATCH -A lqcdtest
 #SBATCH --exclusive
 #SBATCH --nodes=1
 #SBATCH --ntasks=2
 #SBATCH --qos lqcd
 # Slurm batch job for a single node with 2 tasks (see the #SBATCH directives
 # above). For each lattice volume in the list below it runs Benchmark_dwf_fp32
 # twice, once on 2 ranks and once on 1 rank, and writes each run's stdout to
 # <vol>.2socket.out / <vol>.1socket.out.
 source sourceme.sh
 # OpenMP threads per rank. This was spelled OMP_NUM_THREAD, which OpenMP does
 # not read, so the requested thread count was never applied.
 export OMP_NUM_THREADS=24
 #srun -N1 -n1 numactl -H > numa.txt
 #srun -N1 -n1 lstopo ice-topo.pdf
 # Generate the per-rank wrapper. The heredoc delimiter is unquoted, so every
 # '$' that must survive into the generated file is escaped as '\$'.
 # The wrapper exports NUMA, looked up from NUMA_MAP by SLURM_LOCALID, and then
 # execs the wrapped command. The array was previously named NUM_MAP while the
 # lookup used NUMA_MAP, which left NUMA empty. Nothing in this script reads
 # NUMA; the CPU placement itself is requested with --cpu-bind=ldoms below.
 # "\$@" (rather than \$*) forwards the arguments without re-splitting them.
 cat << EOF > select_socket
 #!/bin/bash
 export NUMA_MAP=(0 1)
 export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
 exec "\$@"
 EOF
 chmod +x ./select_socket
 #for vol in 8.8.8.16 8.8.8.32 8.8.8.64
 #for vol in 8.8.16.16 8.8.16.32 8.8.16.64
 for vol in 8.16.16.16 8.16.16.32 8.16.16.64 16.16.16.32 16.16.16.64 24.24.24.64 32.32.32.32
 do
 # Two ranks (--mpi 1.1.1.2), then a single rank (--mpi 1.1.1.1), same volume.
 srun --cpu-bind=ldoms -N1 -n2 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid "$vol" --dslash-asm > "$vol.2socket.out"
 srun --cpu-bind=ldoms -N1 -n1 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid "$vol" --dslash-asm > "$vol.1socket.out"
 done
@@ -1,13 +1,18 @@
 ../../configure \
--enable-comms=mpi \
+--enable-debug \
 --enable-comms=mpi-auto \
 --enable-unified=yes \
 --enable-shm=shmopen \
 --enable-shm-fast-path=shmopen \
 --enable-accelerator=none \
--enable-simd=AVX2 \
+--enable-simd=AVX512 \
 --disable-accelerator-cshift \
 --disable-fermion-reps \
 --disable-gparity \
-CXX=mpicxx \
+CXX=clang++ \
 MPICXX=mpicxx \
 LDFLAGS=-L/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/hwloc-2.9.1-hgkscnt5pferhtde4ahctlupb6qf3vtl/lib/ \
 LIBS=-lhwloc \
 CXXFLAGS="-std=c++17"
@@ -1 +1,2 @@
 export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH
 module load openmpi
`@@ -1 +1,2 @@`
		`export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH`
	`module load openmpi`	`module load openmpi`