Grid (mirror of https://github.com/paboyle/Grid.git)

Aurora MPI standalone benchmark and options that work well

Peter Boyle 2024-02-06 16:28:40 +00:00
parent 2a0d75bac2
commit 5bfa88be85
9 changed files with 426 additions and 118 deletions
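For reference, a minimal sketch of building and launching the new standalone benchmark, using only commands that appear in this commit; it assumes the working directory is MPI_benchmark/ and that mpicxx wraps a SYCL-capable compiler:

cd MPI_benchmark
mpicxx -fsycl halo_mpi.cc -o halo_mpi     # the compile one-liner added below
qsub bench2.pbs                           # 2-node job; runs ./halo_mpi --mpi 2.1.1.1 via gpu_tile_compact.sh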

MPI_benchmark/bench2.pbs (new file, 22 lines)

@@ -0,0 +1,22 @@
#!/bin/bash
#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES
cd $PBS_O_WORKDIR
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
NTOTRANKS=$(( NNODES * NRANKS ))
CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
$CMD
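Note that with --mpi 2.1.1.1 only the first dimension of the process grid has extent greater than one, so halo_mpi (below) exchanges faces across the node boundary only; NRANKS, NDEPTH and NTOTRANKS are computed but unused by this particular mpiexec line. A denser run under the same conventions might look like the following sketch (hypothetical rank counts, not part of the commit):

mpiexec -np 12 -ppn 6 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.2.3.1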

New file (1 line):

@@ -0,0 +1 @@
mpicxx -fsycl halo_mpi.cc -o halo_mpi

New file (30 lines):

@@ -0,0 +1,30 @@
#!/bin/bash
export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
#unset EnableWalkerPartition
#export EnableImplicitScaling=0
#export GRID_MPICH_NIC_BIND=$NIC
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
numactl -m $PNUMA -N $NUMA "$@"
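The wrapper's only job is to turn the node-local rank into NUMA and GPU-tile bindings before running the wrapped program. A worked lookup, illustrative only, for PALS_LOCAL_RANKID=7 with the maps above:

# local rank 7 -> GPU_MAP[7]=1, TILE_MAP[7]=1, NUMA_PMAP[7]=2, NUMA_MAP[7]=0
# so the rank runs as:
#   ZE_AFFINITY_MASK=1.1               # second tile of GPU 1
#   numactl -m 2 -N 0 ./halo_mpi ...   # memory from domain 2, cpus from domain 0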

MPI_benchmark/halo_mpi.cc (new file, 333 lines)

@@ -0,0 +1,333 @@
#include <cassert>
#include <complex>
#include <memory>
#include <vector>
#include <algorithm>
#include <array>
#include <string>
#include <sstream>
#include <cctype>
#include <cstdint>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <ctime>
#include <sys/time.h>
#include <mpi.h>
/**************************************************************
* GPU - GPU memory cartesian halo exchange benchmark
* Config: what is the target
**************************************************************
*/
#undef ACC_CUDA
#undef ACC_HIP
#define ACC_SYCL
#undef ACC_NONE
/**************************************************************
* Some MPI globals
**************************************************************
*/
MPI_Comm WorldComm;
MPI_Comm WorldShmComm;
int WorldSize;
int WorldRank;
int WorldShmSize;
int WorldShmRank;
/**************************************************************
* Allocate buffers on the GPU, SYCL needs an init call and context
**************************************************************
*/
#ifdef ACC_CUDA
#include <cuda.h>
void acceleratorInit(void){}
void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMalloc((void **)&ptr,bytes);
assert(err==cudaSuccess);
return ptr;
}
void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}
#endif
#ifdef ACC_HIP
#include <hip/hip_runtime.h>
void acceleratorInit(void){}
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
#endif
#ifdef ACC_SYCL
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
cl::sycl::queue *theAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
#if 1
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theAccelerator = new sycl::queue (selectedDevice);
#else
cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v };
theAccelerator = new sycl::queue (selectedDevice);
#endif
auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
}
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
#endif
#ifdef ACC_NONE
void acceleratorInit(void){}
inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
/**************************************************************
* Microsecond timer
**************************************************************
*/
inline double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
}
/**************************************************************
* Main benchmark routine
**************************************************************
*/
void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
{
int64_t words = 3*4*2;
int64_t face,vol;
int Nd=cart_geom.size();
/**************************************************************
* L^Nd volume, L^(Nd-1) faces, 12 complex per site
* Allocate memory for these
**************************************************************
*/
face=1; for( int d=0;d<Nd-1;d++) face = face*L;
vol=1; for( int d=0;d<Nd;d++) vol = vol*L;
std::vector<void *> send_bufs;
std::vector<void *> recv_bufs;
size_t vw = face*words;
size_t bytes = face*words*sizeof(double);
if ( use_device ) {
for(int d=0;d<2*Nd;d++){
send_bufs.push_back(acceleratorAllocDevice(bytes));
recv_bufs.push_back(acceleratorAllocDevice(bytes));
}
} else {
for(int d=0;d<2*Nd;d++){
send_bufs.push_back(malloc(bytes));
recv_bufs.push_back(malloc(bytes));
}
}
/*********************************************************
* Build cartesian communicator
*********************************************************
*/
int ierr;
int rank;
std::vector<int> coor(Nd);
MPI_Comm communicator;
std::vector<int> periodic(Nd,1);
MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&rank);
MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
static int reported;
if ( ! reported ) {
printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
reported =1 ;
}
/*********************************************************
* Perform halo exchanges
*********************************************************
*/
for(int d=0;d<Nd;d++){
if ( cart_geom[d]>1 ) {
double t0=usecond();
int from,to;
MPI_Barrier(communicator);
for(int n=0;n<ncall;n++){
void *xmit = (void *)send_bufs[d];
void *recv = (void *)recv_bufs[d];
ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
assert(ierr==0);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
xmit = (void *)send_bufs[Nd+d];
recv = (void *)recv_bufs[Nd+d];
ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
assert(ierr==0);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
MPI_Barrier(communicator);
double t1=usecond();
double dbytes = bytes*WorldShmSize;
double xbytes = dbytes*2.0*ncall;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
if ( ! WorldRank ) {
printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout);
}
}
}
/*********************************************************
* Free memory
*********************************************************
*/
if ( use_device ) {
for(int d=0;d<2*Nd;d++){
acceleratorFreeDevice(send_bufs[d]);
acceleratorFreeDevice(recv_bufs[d]);
}
} else {
for(int d=0;d<2*Nd;d++){
free(send_bufs[d]);
free(recv_bufs[d]);
}
}
}
/**************************************
* Command line junk
**************************************/
std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
{
char ** itr = std::find(begin, end, option);
if (itr != end && ++itr != end) {
std::string payload(*itr);
return payload;
}
return std::string("");
}
bool CmdOptionExists(char** begin, char** end, const std::string& option)
{
return std::find(begin, end, option) != end;
}
void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
{
vec.resize(0);
std::stringstream ss(str);
int i;
while (ss >> i){
vec.push_back(i);
if(std::ispunct(ss.peek()))
ss.ignore();
}
return;
}
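// Illustrative example (not in the original source): CmdOptionIntVector("2.1.1.1", mpi)
// extracts integers and skips the single punctuation character between them,
// yielding mpi = {2,1,1,1}; this is how the --mpi argument from bench2.pbs is parsed.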
/**************************************
* Main
**************************************/
int main(int argc, char **argv)
{
std::string arg;
acceleratorInit();
MPI_Init(&argc,&argv);
WorldComm = MPI_COMM_WORLD;
MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldComm ,&WorldRank);
MPI_Comm_size(WorldComm ,&WorldSize);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldSize/WorldShmSize > 2) {
printf("This benchmark is meant to run on at most two nodes only\n");
}
auto mpi =std::vector<int>({1,1,1,1});
if( CmdOptionExists(argv,argv+argc,"--mpi") ){
arg = CmdOptionPayload(argv,argv+argc,"--mpi");
CmdOptionIntVector(arg,mpi);
} else {
printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
exit(0);
}
if( !WorldRank ) {
printf("***********************************\n");
printf("%d ranks\n",WorldSize);
printf("%d ranks-per-node\n",WorldShmSize);
printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
printf("Cartesian layout: ");
for(int d=0;d<mpi.size();d++){
printf("%d ",mpi[d]);
}
printf("\n");fflush(stdout);
printf("***********************************\n");
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= Benchmarking HOST memory MPI performance \n");
printf("=========================================================\n");fflush(stdout);
printf("= L\t pkt bytes\t MB/s \n");
printf("=========================================================\n");fflush(stdout);
}
for(int L=16;L<=64;L+=4){
Benchmark(L,mpi,false,100);
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= Benchmarking DEVICE memory MPI performance \n");
printf("=========================================================\n");fflush(stdout);
}
for(int L=16;L<=64;L+=4){
Benchmark(L,mpi,true,100);
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= DONE \n");
printf("=========================================================\n");
}
MPI_Finalize();
}
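For orientation, the arithmetic behind each printed row, restating Benchmark() rather than adding anything new: with Nd=4 and L=32, a face holds face = L^(Nd-1) = 32768 sites of words = 3*4*2 = 24 doubles, so one packet is 32768 * 24 * 8 = 6291456 bytes (about 6.3 MB); the reported rate is bidibytes/(t1-t0), i.e. send plus receive volume over both directional shifts, scaled by the ranks per node, divided by elapsed microseconds, which lands directly in MB/s.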

Modified file:

@@ -1,12 +1,10 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=1
 #PBS -l walltime=01:00:00
-##PBS -A Aurora_Deployment
-#PBS -A LatticeQCD_aesp
+#PBS -A LatticeQCD_aesp_CNDA
 HDIR=/home/paboyle/
 #module use /soft/testing/modulefiles/
@@ -48,7 +46,6 @@ echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
 CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
 ./gpu_tile_compact.sh \
-./Benchmark_comms_host_device --mpi 1.1.2.6 --grid 32.24.32.192 \
+./Benchmark_memory_bandwidth --mpi 1.1.2.6 --grid 32.24.32.192 \
 --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD

Modified file:

@@ -6,40 +6,27 @@
 #PBS -l select=2
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
-HDIR=/home/paboyle/
-#module use /soft/testing/modulefiles/
-#module load intel-UMD23.05.25593.11/23.05.25593.11
-#module load tools/pti-gpu
-#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
-#export PATH=$HDIR/tools/bin:$PATH
-export TZ='/usr/share/zoneinfo/US/Central'
-export OMP_PROC_BIND=spread
-export OMP_NUM_THREADS=3
-unset OMP_PLACES
+#export OMP_PROC_BIND=spread
+#export OMP_NUM_THREADS=3
+#unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
-echo Jobid: $PBS_JOBID
-echo Running on host `hostname`
-echo Running on nodes `cat $PBS_NODEFILE`
-echo NODES
-cat $PBS_NODEFILE
+#echo Jobid: $PBS_JOBID
+#echo Running on host `hostname`
+#echo Running on nodes `cat $PBS_NODEFILE`
+#echo NODES
+#cat $PBS_NODEFILE
 NNODES=`wc -l < $PBS_NODEFILE`
 NRANKS=12           # Number of MPI ranks per node
-NDEPTH=4           # Number of hardware threads per rank, spacing between MPI ranks on a node
+NDEPTH=3           # Number of hardware threads per rank, spacing between MPI ranks on a node
 NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
 NTOTRANKS=$(( NNODES * NRANKS ))
-echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
-echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
 CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \
 ./gpu_tile_compact.sh \
@@ -60,7 +47,7 @@ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 #$CMD | tee 2-to-2.comms.hmem1
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
-#$CMD | tee 2-to-2.comms.hmem0
+$CMD | tee 2-to-2.comms.hmem0
 CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
 ./gpu_tile_compact.sh \
@@ -74,15 +61,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
 CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
-./gpu_tile_compact4.sh \
+./gpu_tile_compact4a.sh \
 ./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
 --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
-$CMD | tee 4-to-4.comms.hmem1.nic-affinity
+#$CMD | tee 4-to-4.comms.hmem1.nic-affinity
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
-$CMD | tee 4-to-4.comms.hmem0.nic-affinity
+$CMD | tee 4-to-4.comms.hmem0
+#mpiexec -np 1 --ppn 1 -d 1 numactl -H | tee numa.log
 CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
 ./gpu_tile_compact.sh \
@@ -92,7 +80,7 @@ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 #$CMD | tee 6-to-6.comms.hmem1
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
-#$CMD | tee 6-to-6.comms.hmem0
+$CMD | tee 6-to-6.comms.hmem0
 CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
@@ -104,4 +92,4 @@ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 #$CMD | tee 12-to-12.comms.hmem1
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
-#$CMD | tee 12-to-12.comms.hmem0
+$CMD | tee 12-to-12.comms.hmem0
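The recurring MPIR_CVAR_CH4_OFI_ENABLE_HMEM toggle switches MPICH's CH4/OFI support for GPU-resident (heterogeneous) memory on and off; as rewritten, each rank count now runs and logs only the hmem=0 case. That reading of the CVAR comes from MPICH's documentation, not from this commit.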

Modified file:

@@ -1,65 +1,33 @@
 #!/bin/bash
-display_help() {
-echo " Will map gpu tile to rank in compact and then round-robin fashion"
-echo " Usage (only work for one node of ATS/PVC):"
-echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
-echo
-echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
-echo " 0 Rank 0.0"
-echo " 1 Rank 0.1"
-echo " 2 Rank 1.0"
-echo " 3 Rank 1.1"
-echo " 4 Rank 2.0"
-echo " 5 Rank 2.1"
-echo " 6 Rank 0.0"
-echo
-echo " Hacked together by apl@anl.gov, please contact if bug found"
-exit 1
-}
-#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
-#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
-num_gpu=6
-num_tile=2
-if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
-display_help
-fi
-gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
-tile_id=$((PALS_LOCAL_RANKID % num_tile))
-export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
 export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
 export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
-export GRID_MPICH_NIC_BIND=$NIC
+#export GRID_MPICH_NIC_BIND=$NIC
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
-export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
-#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
-export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
-if [ $PALS_LOCAL_RANKID = 0 ]
-then
-numactl -m $NUMA -N $NUMA "$@"
-else
-numactl -m $NUMA -N $NUMA "$@"
-fi
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+numactl -m $NUMA -N $NUMAP "$@"

Modified file:

@@ -1,39 +1,8 @@
 #!/bin/bash
-display_help() {
-echo " Will map gpu tile to rank in compact and then round-robin fashion"
-echo " Usage (only work for one node of ATS/PVC):"
-echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
-echo
-echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
-echo " 0 Rank 0.0"
-echo " 1 Rank 0.1"
-echo " 2 Rank 1.0"
-echo " 3 Rank 1.1"
-echo " 4 Rank 2.0"
-echo " 5 Rank 2.1"
-echo " 6 Rank 0.0"
-echo
-echo " Hacked together by apl@anl.gov, please contact if bug found"
-exit 1
-}
-#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
-#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
-num_gpu=6
-num_tile=2
-if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
-display_help
-fi
-gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
-tile_id=$((PALS_LOCAL_RANKID % num_tile))
-export NUMA_MAP=(0 0 1 1 0 0 1 1 )
-export NIC_MAP=(0 1 4 5 0 1 4 5 )
+export NUMA_MAP=(2 2 3 3 2 2 3 3 )
+export PROC_MAP=(0 0 1 1 0 0 1 1 )
+export NIC_MAP=(0 0 4 4 1 1 5 5 )
 export GPU_MAP=(0 1 3 4 0 1 3 4 )
 export TILE_MAP=(0 0 0 0 1 1 1 1 )
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
@@ -41,7 +10,7 @@ export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
-export GRID_MPICH_NIC_BIND=$NIC
+#export GRID_MPICH_NIC_BIND=$NIC
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
@@ -55,6 +24,6 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA"
-numactl -m $NUMA -N $NUMA "$@"
+numactl -m $NUMA -N $PROC_MAP "$@"
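Net effect of the new maps: memory and execution bindings are now split, with numactl -m placed on NUMA domains 2/3 while -N keeps the processes on domains 0/1. On Aurora-class Sapphire Rapids nodes these higher-numbered domains are plausibly the HBM regions, which would account for the "options that work well" in the commit message; that identification is an assumption, not something the diff states.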

Modified file:

@@ -7,6 +7,6 @@ export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
-export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128