From 62055e04ddfda2d37069a0fb10e8af1aa45a0140 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 13 Feb 2024 18:18:27 +0100 Subject: [PATCH 1/5] missing semicolon generates error with some compilers --- Grid/qcd/smearing/StoutSmearing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/smearing/StoutSmearing.h b/Grid/qcd/smearing/StoutSmearing.h index 641331dc..787ef104 100644 --- a/Grid/qcd/smearing/StoutSmearing.h +++ b/Grid/qcd/smearing/StoutSmearing.h @@ -69,7 +69,7 @@ public: /*! Construct stout smearing object from explicitly specified rho matrix */ Smear_Stout(const std::vector& rho_) : OwnedBase{new Smear_APE(rho_)}, SmearBase{OwnedBase.get()} { - std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector& " << rho_ << " )" << std::endl + std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector& " << rho_ << " )" << std::endl; assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3"); } From 585efc6f3fce63f9766b2b66d3ae279ff0944a56 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:40:49 +0000 Subject: [PATCH 2/5] More benchmark scripts --- systems/Aurora/benchmarks/bench256.pbs | 48 ++++++++++++++++++++++++++ systems/Aurora/benchmarks/bench512.pbs | 48 ++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench256.pbs create mode 100644 systems/Aurora/benchmarks/bench512.pbs diff --git a/systems/Aurora/benchmarks/bench256.pbs b/systems/Aurora/benchmarks/bench256.pbs new file mode 100644 index 00000000..405d9ed4 --- /dev/null +++ b/systems/Aurora/benchmarks/bench256.pbs @@ -0,0 +1,48 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=256 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + 
+cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 256 nodes, 3072 ranks +# +CMD="mpiexec -np 3072 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 3072 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 256node.dwf.large diff --git a/systems/Aurora/benchmarks/bench512.pbs b/systems/Aurora/benchmarks/bench512.pbs new file mode 100644 index 00000000..0d8708d3 --- /dev/null +++ b/systems/Aurora/benchmarks/bench512.pbs @@ -0,0 +1,48 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=512 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 512 nodes, 6144 ranks +# +CMD="mpiexec -np 6144 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 6144 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 512node.dwf.large From 1502860004f953d02e2cf8b6d892e1109d940e04 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:47:02 +0000 Subject: [PATCH 3/5] Benchmark scripts --- systems/Aurora/benchmarks/bench1024.pbs | 56 +++++++++++++++++++++++++ systems/Aurora/benchmarks/bench2048.pbs | 56 +++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench1024.pbs create mode 100644 systems/Aurora/benchmarks/bench2048.pbs diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs new file mode 100644 index 00000000..88f0100a --- /dev/null +++ b/systems/Aurora/benchmarks/bench1024.pbs @@ -0,0 +1,56 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=1024 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 
+ +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 1024 nodes, 12288 ranks +# +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1024node.dwf.small + +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1024node.dwf + + diff --git a/systems/Aurora/benchmarks/bench2048.pbs b/systems/Aurora/benchmarks/bench2048.pbs new file mode 100644 index 00000000..b79081a2 --- /dev/null +++ b/systems/Aurora/benchmarks/bench2048.pbs @@ -0,0 +1,56 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=2048 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 2048 nodes, 24576 ranks +# +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2048node.dwf.small + +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2048node.dwf + + From 5ef4da3f29f95b843bf97bce603ba43f1c54029d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:47:36 +0000 Subject: [PATCH 4/5] Silence verbose --- systems/Aurora/benchmarks/gpu_tile_compact.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile_compact.sh index 69ba5107..5cab1ee3 100755 --- a/systems/Aurora/benchmarks/gpu_tile_compact.sh +++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh @@ -28,6 +28,6 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 #export 
SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 -echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " +#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " numactl -m $NUMA -N $NUMAP "$@" From 303b83cdb80ad4e440785854976b34b8d2381d8e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:48:03 +0000 Subject: [PATCH 5/5] Scaling benchmarks, verbosity and MPICH aware in acceleratorInit() For some reason Dirichlet benchmark fails on several nodes; need to debug this. --- Grid/threads/Accelerator.cc | 19 ++++- benchmarks/Benchmark_dwf_fp32.cc | 20 +++--- systems/Aurora/benchmarks/bench_scaling.pbs | 80 +++++++++++++++++++++ 3 files changed, 106 insertions(+), 13 deletions(-) create mode 100644 systems/Aurora/benchmarks/bench_scaling.pbs diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 3769b2aa..19411b62 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -7,6 +7,8 @@ uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; +#define ENV_LOCAL_RANK_PALS "PALS_LOCAL_RANKID" +#define ENV_RANK_PALS "PALS_RANKID" #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" #define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID" @@ -228,8 +230,17 @@ void acceleratorInit(void) { rank = atoi(localRankStr); } + if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL) + { + rank = atoi(localRankStr); + } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_PALS )) != NULL) { world_rank = atoi(localRankStr);} + + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + if ( rank==0 ) printf(" 
acceleratorInit world_rank %d is host %s \n",world_rank,hostname); auto devices = cl::sycl::device::get_devices(); for(int d = 0;d()); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); + if ( world_rank == 0) { - GPU_PROP_STR(vendor); - GPU_PROP_STR(version); + GPU_PROP_STR(vendor); + GPU_PROP_STR(version); // GPU_PROP_STR(device_type); /* GPU_PROP(max_compute_units); @@ -259,7 +271,8 @@ void acceleratorInit(void) GPU_PROP(single_fp_config); */ // GPU_PROP(double_fp_config); - GPU_PROP(global_mem_size); + GPU_PROP(global_mem_size); + } } if ( world_rank == 0 ) { diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 37287595..ce4fcfab 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -90,11 +90,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 1 : 0; - Dirichlet[0] = 0; - Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; - Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; - Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; - Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + // Dirichlet[0] = 0; + // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; Benchmark(Ls,Dirichlet); @@ -105,11 +105,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 
1 : 0; - Dirichlet[0] = 0; - Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; - Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; - Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; - Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; + // Dirichlet[0] = 0; + // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; + // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; + // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; + // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; Benchmark(Ls,Dirichlet); diff --git a/systems/Aurora/benchmarks/bench_scaling.pbs b/systems/Aurora/benchmarks/bench_scaling.pbs new file mode 100644 index 00000000..504fd3e9 --- /dev/null +++ b/systems/Aurora/benchmarks/bench_scaling.pbs @@ -0,0 +1,80 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=32 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 384 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + 
./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1node.dwf + + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.dwf + +CMD="mpiexec -np 48 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 4node.dwf + +CMD="mpiexec -np 96 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 8node.dwf + +CMD="mpiexec -np 192 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 16node.dwf + + +CMD="mpiexec -np 384 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 32node.dwf