From 5bfa88be85b454e09ed82a8361f4cdf6846e7823 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 6 Feb 2024 16:28:40 +0000
Subject: [PATCH] Aurora MPI standalone benchmark and options that work well

---
 MPI_benchmark/bench2.pbs                      |  22 ++
 MPI_benchmark/compile-command                 |   1 +
 MPI_benchmark/gpu_tile_compact.sh             |  30 ++
 MPI_benchmark/halo_mpi.cc                     | 333 ++++++++++++++++++
 systems/Aurora/benchmarks/bench.pbs           |   7 +-
 systems/Aurora/benchmarks/bench2.pbs          |  44 +--
 systems/Aurora/benchmarks/gpu_tile_compact.sh |  62 +---
 .../Aurora/benchmarks/gpu_tile_compact4.sh    |  43 +--
 systems/Aurora/sourceme.sh                    |   2 +-
 9 files changed, 426 insertions(+), 118 deletions(-)
 create mode 100644 MPI_benchmark/bench2.pbs
 create mode 100644 MPI_benchmark/compile-command
 create mode 100755 MPI_benchmark/gpu_tile_compact.sh
 create mode 100644 MPI_benchmark/halo_mpi.cc

diff --git a/MPI_benchmark/bench2.pbs b/MPI_benchmark/bench2.pbs
new file mode 100644
index 00000000..2c069a20
--- /dev/null
+++ b/MPI_benchmark/bench2.pbs
@@ -0,0 +1,22 @@
+#!/bin/bash
+#PBS -q EarlyAppAccess
+#PBS -l select=2
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+export TZ='/usr/share/zoneinfo/US/Central'
+export OMP_PROC_BIND=spread
+export OMP_NUM_THREADS=3
+unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12                  # Number of MPI ranks per node
+NDEPTH=4                   # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS  # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+
+CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
+$CMD
diff --git a/MPI_benchmark/compile-command b/MPI_benchmark/compile-command
new file mode 100644
index 00000000..20f26a3c
--- /dev/null
+++ b/MPI_benchmark/compile-command
@@ -0,0 +1 @@
+mpicxx -fsycl halo_mpi.cc -o halo_mpi
\ No newline at end of file
diff --git a/MPI_benchmark/gpu_tile_compact.sh b/MPI_benchmark/gpu_tile_compact.sh
new file mode 100755
index 00000000..28fdb341
--- /dev/null
+++ b/MPI_benchmark/gpu_tile_compact.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+
+export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
+export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
+export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
+
+
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+
+#unset EnableWalkerPartition
+#export EnableImplicitScaling=0
+#export GRID_MPICH_NIC_BIND=$NIC
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
+#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+
+numactl -m $PNUMA -N $NUMA "$@"
diff --git a/MPI_benchmark/halo_mpi.cc b/MPI_benchmark/halo_mpi.cc
new file mode 100644
index 00000000..9e11c473
--- /dev/null
+++ b/MPI_benchmark/halo_mpi.cc
@@ -0,0 +1,333 @@
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>
+#include <cctype>
+#include <vector>
+#include <string>
+#include <sstream>
+#include <algorithm>
+#include <memory>
+#include <sys/time.h>
+
+#include <mpi.h>
+
+/**************************************************************
+ * GPU - GPU memory cartesian halo exchange benchmark
+ * Config: what is the target
+ **************************************************************
+ */
+#undef ACC_CUDA
+#undef ACC_HIP
+#define ACC_SYCL
+#undef ACC_NONE
+
+/**************************************************************
+ * Some MPI globals
+ **************************************************************
+ */
+MPI_Comm WorldComm;
+MPI_Comm WorldShmComm;
+
+int WorldSize;
+int WorldRank;
+
+int WorldShmSize;
+int WorldShmRank;
+
+/**************************************************************
+ * Allocate buffers on the GPU, SYCL needs an init call and context
+ **************************************************************
+ */
+#ifdef ACC_CUDA
+#include <cuda_runtime.h>
+void acceleratorInit(void){}
+void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMalloc((void **)&ptr,bytes);
+  assert(err==cudaSuccess);
+  return ptr;
+}
+void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}
+#endif
+#ifdef ACC_HIP
+#include <hip/hip_runtime.h>
+void acceleratorInit(void){}
+inline void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipMalloc((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
+  }
+  return ptr;
+};
+inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
+#endif
+#ifdef ACC_SYCL
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+cl::sycl::queue *theAccelerator;
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+#if 1
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theAccelerator = new sycl::queue (selectedDevice);
+#else
+  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v };
+  theAccelerator = new sycl::queue (selectedDevice);
+#endif
+  auto name = theAccelerator->get_device().get_info<cl::sycl::info::device::name>();
+  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
+}
+inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
+#endif
+#ifdef ACC_NONE
+void acceleratorInit(void){}
+inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr);};
+#endif
+
+
+/**************************************************************
+ * Microsecond timer
+ **************************************************************
+ */
+inline double usecond(void) {
+  struct timeval tv;
+  gettimeofday(&tv,NULL);
+  return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
+}
+/**************************************************************
+ * Main benchmark routine
+ **************************************************************
+ */
+void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
+{
+  int64_t words = 3*4*2;
+  int64_t face,vol;
+  int Nd=cart_geom.size();
+
+  /**************************************************************
+   * L^Nd volume, L^(Nd-1) faces, 12 complex per site
+   * Allocate memory for these
+   **************************************************************
+   */
+  face=1; for( int d=0;d<Nd-1;d++) face = face*L;
+  vol =1; for( int d=0;d<Nd  ;d++) vol  = vol*L;
+
+  std::vector<void *> send_bufs;
+  std::vector<void *> recv_bufs;
+  size_t vw    = face*words;
+  size_t bytes = face*words*sizeof(double);
+
+  if ( use_device ) {
+    for(int d=0;d<2*Nd;d++){
+      send_bufs.push_back(acceleratorAllocDevice(bytes));
+      recv_bufs.push_back(acceleratorAllocDevice(bytes));
+    }
+  } else {
+    for(int d=0;d<2*Nd;d++){
+      send_bufs.push_back(malloc(bytes));
+      recv_bufs.push_back(malloc(bytes));
+    }
+  }
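+  /*
+   * 2*Nd buffers: one face of L^(Nd-1) sites x 24 doubles per direction.
+   * With use_device these buffers are device resident, so the exchange below
+   * exercises GPU-aware MPI; otherwise ordinary host memory is moved.
+   */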
+  /*********************************************************
+   * Build cartesian communicator
+   *********************************************************
+   */
+  int ierr;
+  int rank;
+  std::vector<int> coor(Nd);
+  MPI_Comm communicator;
+  std::vector<int> periodic(Nd,1);
+  MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
+  MPI_Comm_rank(communicator,&rank);
+  MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
+
+  static int reported;
+  if ( ! reported ) {
+    printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
+           coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
+    reported = 1;
+  }
+  /*********************************************************
+   * Perform halo exchanges
+   *********************************************************
+   */
+  for(int d=0;d<Nd;d++){
+    if ( cart_geom[d]>1 ) {
+      double t0=usecond();
+
+      int from,to;
+
+      MPI_Barrier(communicator);
+      for(int n=0;n<ncall;n++){
+
+        ierr = MPI_Cart_shift(communicator,d,1,&from,&to);
+        assert(ierr==0);
+
+        /* forward face: send to the +d neighbour, receive from the -d neighbour */
+        ierr = MPI_Sendrecv(send_bufs[d]   ,bytes,MPI_CHAR,to  ,0,
+                            recv_bufs[d]   ,bytes,MPI_CHAR,from,0,
+                            communicator,MPI_STATUS_IGNORE);
+        assert(ierr==0);
+
+        /* backward face: send to the -d neighbour, receive from the +d neighbour */
+        ierr = MPI_Sendrecv(send_bufs[Nd+d],bytes,MPI_CHAR,from,1,
+                            recv_bufs[Nd+d],bytes,MPI_CHAR,to  ,1,
+                            communicator,MPI_STATUS_IGNORE);
+        assert(ierr==0);
+      }
+      MPI_Barrier(communicator);
+      double t1=usecond();
+
+      double dbytes = 4.0*bytes*ncall;  /* two faces, each sent and received */
+      if ( ! WorldRank ) {
+        printf("Dim %d %ld bytes %s %.2f GB/s per rank\n",d,(long)bytes,
+               use_device ? "device memory" : "host memory  ",
+               dbytes/(t1-t0)/1000.);
+        fflush(stdout);
+      }
+    }
+  }
+
+  for(int d=0;d<2*Nd;d++){
+    if ( use_device ) { acceleratorFreeDevice(send_bufs[d]); acceleratorFreeDevice(recv_bufs[d]); }
+    else              { free(send_bufs[d]);                  free(recv_bufs[d]);                  }
+  }
+  MPI_Comm_free(&communicator);
+}
+
+std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
+{
+  char ** itr = std::find(begin, end, option);
+  if (itr != end && ++itr != end) {
+    std::string payload(*itr);
+    return payload;
+  }
+  return std::string("");
+}
+bool CmdOptionExists(char** begin, char** end, const std::string& option)
+{
+  return std::find(begin, end, option) != end;
+}
+void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
+{
+  vec.resize(0);
+  std::stringstream ss(str);
+  int i;
+  while (ss >> i){
+    vec.push_back(i);
+    if(std::ispunct(ss.peek()))
+      ss.ignore();
+  }
+  return;
+}
+/**************************************
+ * Command line junk
+ **************************************/
+int main(int argc, char **argv)
+{
+  std::string arg;
+
+  acceleratorInit();
+
+  MPI_Init(&argc,&argv);
+
+  WorldComm = MPI_COMM_WORLD;
+
+  MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
+
+  MPI_Comm_rank(WorldComm ,&WorldRank);
+  MPI_Comm_size(WorldComm ,&WorldSize);
+
+  MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
+  MPI_Comm_size(WorldShmComm ,&WorldShmSize);
+
+  if ( WorldSize/WorldShmSize > 2) {
+    printf("This benchmark is meant to run on at most two nodes\n");
+  }
+
+  auto mpi = std::vector<int>({1,1,1,1});
+
+  if( CmdOptionExists(argv,argv+argc,"--mpi") ){
+    arg = CmdOptionPayload(argv,argv+argc,"--mpi");
+    CmdOptionIntVector(arg,mpi);
+  } else {
+    printf("Must specify --mpi command line argument\n");
+    exit(0);
+  }
+
+  if( !WorldRank ) {
+    printf("***********************************\n");
+    printf("%d ranks\n",WorldSize);
+    printf("%d ranks-per-node\n",WorldShmSize);
+    printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
+    printf("Cartesian layout: ");
+    for(int d=0;d
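
A minimal build-and-run recipe, following MPI_benchmark/compile-command and
MPI_benchmark/bench2.pbs above (it assumes the binary and the wrapper script
sit in the current working directory):

  # build with the SYCL-enabled MPI wrapper
  mpicxx -fsycl halo_mpi.cc -o halo_mpi

  # two nodes, one rank per node; gpu_tile_compact.sh pins each rank to a
  # GPU tile and NUMA domain before exec'ing the benchmark
  mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1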