Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-09 23:45:36 +00:00)

Commit 88b52cc045: Merge branch 'develop' into hisq_fat_links
@ -152,6 +152,7 @@ public:
#ifdef RNG_FAST_DISCARD
  static void Skip(RngEngine &eng,uint64_t site)
  {
#if 0
    /////////////////////////////////////////////////////////////////////////////////////
    // Skip by 2^40 elements between successive lattice sites
    // This goes by 10^12.
@ -162,9 +163,9 @@ public:
    // tens of seconds per trajectory so this is clean in all reasonable cases,
    // and margin of safety is orders of magnitude.
    // We could hack Sitmo to skip in the higher order words of state if necessary
    //
    // Replace with 2^30 ; avoid problem on large volumes
    //
    //
    // Replace with 2^30 ; avoid problem on large volumes
    //
    /////////////////////////////////////////////////////////////////////////////////////
    // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
    const int shift = 30;
@ -179,6 +180,9 @@ public:
    assert((skip >> shift)==site); // check for overflow

    eng.discard(skip);
#else
    eng.discardhi(site);
#endif
    // std::cout << " Engine " <<site << " state " <<eng<<std::endl;
  }
#endif
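As a reading aid for the hunk above: the fast-discard path gives every lattice site its own RNG sub-stream, either by discarding a fixed 2^30 draws per site (the disabled branch) or by delegating the same job to Sitmo's discardhi. Below is a minimal sketch of that skip arithmetic; std::mt19937_64 is only a stand-in for Grid's RngEngine, so treat it as an illustration of the computation rather than the library's code.

#include <cassert>
#include <cstdint>
#include <random>

// Give lattice site 'site' a private sub-stream by skipping ahead
// 2^30 draws per site, with the same overflow check the patch asserts.
static void SkipSketch(std::mt19937_64 &eng, uint64_t site)
{
  const int shift = 30;
  uint64_t skip = site;
  skip = skip << shift;             // reserve 2^30 draws for this site
  assert((skip >> shift) == site);  // check the shift did not overflow
  eng.discard(skip);                // stand-in for Sitmo's discard/discardhi
}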
@ -69,7 +69,7 @@ public:
  /*! Construct stout smearing object from explicitly specified rho matrix */
  Smear_Stout(const std::vector<double>& rho_)
    : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl
    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
  }
@ -218,6 +218,10 @@ public:
    // -------------------------------------------------
    // misc
    // -------------------------------------------------
    void discardhi(uint64_t z) {
      _s[3] += z;
      encrypt_counter();
    }

    // req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9
    // Advances e's state ei to ei+z by any means equivalent to z
@ -387,4 +391,4 @@ private:
#undef MIXK
#undef MIX2

#endif
#endif
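The discardhi() added above advances the engine by bumping the top word of Sitmo's multi-word counter, so successive lattice sites land in widely separated, non-overlapping parts of the stream. The toy counter generator below only illustrates that shape; the state layout, increment rule and output hash are placeholders, not the real Sitmo internals.

#include <array>
#include <cstdint>

// Toy counter-based generator illustrating the discardhi() idea: the output
// is a hash of a 256-bit counter, and bumping the top word jumps the stream
// far ahead. Placeholder logic only; not the Sitmo engine.
struct ToyCounterRng {
  std::array<uint64_t,4> s{};                 // s[0] low word ... s[3] high word
  uint64_t next() {
    uint64_t x = (s[0] ^ (s[1] << 1) ^ (s[2] << 2) ^ (s[3] << 3)) * 0x9E3779B97F4A7C15ull;
    if (++s[0] == 0 && ++s[1] == 0 && ++s[2] == 0) ++s[3];   // 256-bit increment
    return x;
  }
  void discardhi(uint64_t z) { s[3] += z; }   // same shape as the patch's method
};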
@ -7,6 +7,8 @@ uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;};
void acceleratorThreads(uint32_t t) {accelerator_threads = t;};

#define ENV_LOCAL_RANK_PALS "PALS_LOCAL_RANKID"
#define ENV_RANK_PALS "PALS_RANKID"
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
@ -228,8 +230,17 @@ void acceleratorInit(void)
  {
    rank = atoi(localRankStr);
  }
  if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL)
  {
    rank = atoi(localRankStr);
  }
  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_PALS )) != NULL) { world_rank = atoi(localRankStr);}

  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, HOST_NAME_MAX+1);
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);

  auto devices = cl::sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){
@ -241,9 +252,10 @@ void acceleratorInit(void)
    printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());

#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
  if ( world_rank == 0) {

    GPU_PROP_STR(vendor);
    GPU_PROP_STR(version);
    GPU_PROP_STR(vendor);
    GPU_PROP_STR(version);
    // GPU_PROP_STR(device_type);
    /*
    GPU_PROP(max_compute_units);
@ -259,7 +271,8 @@ void acceleratorInit(void)
    GPU_PROP(single_fp_config);
    */
    // GPU_PROP(double_fp_config);
    GPU_PROP(global_mem_size);
    GPU_PROP(global_mem_size);
  }

}
if ( world_rank == 0 ) {
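The new PALS branches above follow the same pattern as the existing OMPI/MVAPICH/SLURM detection: probe a launcher-specific environment variable and parse the rank from it. A compact, self-contained restatement of that pattern follows; the variable names are the ones the patch defines, while DetectLocalRank is only an illustrative helper, not a Grid function.

#include <cstdlib>

// Probe each launcher's environment variable in turn; fall back to rank 0
// when none is set (single process or unknown launcher).
int DetectLocalRank(void)
{
  const char *candidates[] = { "PALS_LOCAL_RANKID",          // PALS (Aurora-style launcher)
                               "OMPI_COMM_WORLD_LOCAL_RANK", // OpenMPI
                               "SLURM_LOCALID" };            // SLURM
  for (const char *name : candidates) {
    if (const char *val = std::getenv(name)) return std::atoi(val);
  }
  return 0;
}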
MPI_benchmark/bench2.pbs (new file, 22 lines)
@ -0,0 +1,22 @@
#!/bin/bash
#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES

cd $PBS_O_WORKDIR

NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12                 # Number of MPI ranks per node
NDEPTH=4                  # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS

NTOTRANKS=$(( NNODES * NRANKS ))

CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
$CMD
MPI_benchmark/compile-command (new file, 1 line)
@ -0,0 +1 @@
mpicxx -fsycl halo_mpi.cc -o halo_mpi
MPI_benchmark/gpu_tile_compact.sh (new executable file, 30 lines)
@ -0,0 +1,30 @@
#!/bin/bash

export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )

export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}

export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero

#unset EnableWalkerPartition
#export EnableImplicitScaling=0
#export GRID_MPICH_NIC_BIND=$NIC
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "

numactl -m $PNUMA -N $NUMA "$@"
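For reference, the GPU_MAP/TILE_MAP arrays in the wrapper above encode a compact mapping of 12 local ranks onto 6 GPUs x 2 tiles: ranks 0-5 take tile 0 of GPUs 0-5, ranks 6-11 take tile 1, and ZE_AFFINITY_MASK becomes "gpu.tile". A small C++ check of that mapping, illustrative only (the real selection is done by the shell arrays, and the 6x2 node shape is an assumption read off the arrays):

#include <cstdio>

int main()
{
  const int num_gpu = 6, num_tile = 2;          // node shape implied by the arrays above
  for (int local_rank = 0; local_rank < num_gpu * num_tile; local_rank++) {
    int gpu  = local_rank % num_gpu;            // matches GPU_MAP
    int tile = local_rank / num_gpu;            // matches TILE_MAP
    std::printf("local rank %2d -> ZE_AFFINITY_MASK=%d.%d\n", local_rank, gpu, tile);
  }
  return 0;
}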
MPI_benchmark/halo_mpi.cc (new file, 333 lines)
@ -0,0 +1,333 @@
#include <cassert>
#include <complex>
#include <memory>
#include <vector>
#include <algorithm>
#include <array>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <ctime>
#include <sys/time.h>

#include <mpi.h>

/**************************************************************
 * GPU - GPU memory cartesian halo exchange benchmark
 * Config: what is the target
 **************************************************************
 */
#undef ACC_CUDA
#undef ACC_HIP
#define ACC_SYCL
#undef ACC_NONE

/**************************************************************
 * Some MPI globals
 **************************************************************
 */
MPI_Comm WorldComm;
MPI_Comm WorldShmComm;

int WorldSize;
int WorldRank;

int WorldShmSize;
int WorldShmRank;

/**************************************************************
 * Allocate buffers on the GPU, SYCL needs an init call and context
 **************************************************************
 */
#ifdef ACC_CUDA
#include <cuda.h>
void acceleratorInit(void){}
void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMalloc((void **)&ptr,bytes);
  assert(err==cudaSuccess);
  return ptr;
}
void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}
#endif
#ifdef ACC_HIP
#include <hip/hip_runtime.h>
void acceleratorInit(void){}
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
};
inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
#endif
#ifdef ACC_SYCL
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
cl::sycl::queue *theAccelerator;
void acceleratorInit(void)
{
  int nDevices = 1;
#if 1
  cl::sycl::gpu_selector selector;
  cl::sycl::device selectedDevice { selector };
  theAccelerator = new sycl::queue (selectedDevice);
#else
  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v };
  theAccelerator = new sycl::queue (selectedDevice);
#endif
  auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
}
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
#endif
#ifdef ACC_NONE
void acceleratorInit(void){}
inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif

/**************************************************************
 * Microsecond timer
 **************************************************************
 */
inline double usecond(void) {
  struct timeval tv;
  gettimeofday(&tv,NULL);
  return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
}
/**************************************************************
 * Main benchmark routine
 **************************************************************
 */
void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
{
  int64_t words = 3*4*2;
  int64_t face,vol;
  int Nd=cart_geom.size();

  /**************************************************************
   * L^Nd volume, L^(Nd-1) faces, 12 complex per site
   * Allocate memory for these
   **************************************************************
   */
  face=1; for( int d=0;d<Nd-1;d++) face = face*L;
  vol=1;  for( int d=0;d<Nd;d++)   vol  = vol*L;

  std::vector<void *> send_bufs;
  std::vector<void *> recv_bufs;
  size_t vw = face*words;
  size_t bytes = face*words*sizeof(double);

  if ( use_device ) {
    for(int d=0;d<2*Nd;d++){
      send_bufs.push_back(acceleratorAllocDevice(bytes));
      recv_bufs.push_back(acceleratorAllocDevice(bytes));
    }
  } else {
    for(int d=0;d<2*Nd;d++){
      send_bufs.push_back(malloc(bytes));
      recv_bufs.push_back(malloc(bytes));
    }
  }
  /*********************************************************
   * Build cartesian communicator
   *********************************************************
   */
  int ierr;
  int rank;
  std::vector<int> coor(Nd);
  MPI_Comm communicator;
  std::vector<int> periodic(Nd,1);
  MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
  MPI_Comm_rank(communicator,&rank);
  MPI_Cart_coords(communicator,rank,Nd,&coor[0]);

  static int reported;
  if ( ! reported ) {
    printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
           coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
    reported =1 ;
  }
  /*********************************************************
   * Perform halo exchanges
   *********************************************************
   */
  for(int d=0;d<Nd;d++){
    if ( cart_geom[d]>1 ) {
      double t0=usecond();

      int from,to;

      MPI_Barrier(communicator);
      for(int n=0;n<ncall;n++){

        void *xmit = (void *)send_bufs[d];
        void *recv = (void *)recv_bufs[d];

        ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
        assert(ierr==0);

        ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
                          recv,bytes,MPI_CHAR,from, from,
                          communicator,MPI_STATUS_IGNORE);
        assert(ierr==0);

        xmit = (void *)send_bufs[Nd+d];
        recv = (void *)recv_bufs[Nd+d];

        ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
        assert(ierr==0);

        ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
                          recv,bytes,MPI_CHAR,from, from,
                          communicator,MPI_STATUS_IGNORE);
        assert(ierr==0);
      }
      MPI_Barrier(communicator);

      double t1=usecond();

      double dbytes    = bytes*WorldShmSize;
      double xbytes    = dbytes*2.0*ncall;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

      if ( ! WorldRank ) {
        printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout);
      }
    }
  }
  /*********************************************************
   * Free memory
   *********************************************************
   */
  if ( use_device ) {
    for(int d=0;d<2*Nd;d++){
      acceleratorFreeDevice(send_bufs[d]);
      acceleratorFreeDevice(recv_bufs[d]);
    }
  } else {
    for(int d=0;d<2*Nd;d++){
      free(send_bufs[d]);
      free(recv_bufs[d]);
    }
  }

}

/**************************************
 * Command line junk
 **************************************/

std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
{
  char ** itr = std::find(begin, end, option);
  if (itr != end && ++itr != end) {
    std::string payload(*itr);
    return payload;
  }
  return std::string("");
}
bool CmdOptionExists(char** begin, char** end, const std::string& option)
{
  return std::find(begin, end, option) != end;
}
void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
{
  vec.resize(0);
  std::stringstream ss(str);
  int i;
  while (ss >> i){
    vec.push_back(i);
    if(std::ispunct(ss.peek()))
      ss.ignore();
  }
  return;
}
/**************************************
 * Command line junk
 **************************************/
int main(int argc, char **argv)
{
  std::string arg;

  acceleratorInit();

  MPI_Init(&argc,&argv);

  WorldComm = MPI_COMM_WORLD;

  MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);

  MPI_Comm_rank(WorldComm ,&WorldRank);
  MPI_Comm_size(WorldComm ,&WorldSize);

  MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
  MPI_Comm_size(WorldShmComm ,&WorldShmSize);

  if ( WorldSize/WorldShmSize > 2) {
    printf("This benchmark is meant to run on at most two nodes only\n");
  }

  auto mpi =std::vector<int>({1,1,1,1});

  if( CmdOptionExists(argv,argv+argc,"--mpi") ){
    arg = CmdOptionPayload(argv,argv+argc,"--mpi");
    CmdOptionIntVector(arg,mpi);
  } else {
    printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
    exit(0);
  }

  if( !WorldRank ) {
    printf("***********************************\n");
    printf("%d ranks\n",WorldSize);
    printf("%d ranks-per-node\n",WorldShmSize);
    printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
    printf("Cartesian layout: ");
    for(int d=0;d<mpi.size();d++){
      printf("%d ",mpi[d]);
    }
    printf("\n");fflush(stdout);
    printf("***********************************\n");
  }

  if( !WorldRank ) {
    printf("=========================================================\n");
    printf("= Benchmarking HOST memory MPI performance \n");
    printf("=========================================================\n");fflush(stdout);
    printf("= L\t pkt bytes\t MB/s \n");
    printf("=========================================================\n");fflush(stdout);
  }

  for(int L=16;L<=64;L+=4){
    Benchmark(L,mpi,false,100);
  }

  if( !WorldRank ) {
    printf("=========================================================\n");
    printf("= Benchmarking DEVICE memory MPI performance \n");
    printf("=========================================================\n");fflush(stdout);
  }
  for(int L=16;L<=64;L+=4){
    Benchmark(L,mpi,true,100);
  }

  if( !WorldRank ) {
    printf("=========================================================\n");
    printf("= DONE \n");
    printf("=========================================================\n");
  }
  MPI_Finalize();
}
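Reading the arithmetic in Benchmark() above: the printed figure appears to be packet bytes times ranks per node, times two shift directions, times ncall repetitions, doubled again for send plus receive, divided by the elapsed time in microseconds, so the units come out as MB/s. A standalone restatement of that bookkeeping (HaloRateMBs is an illustrative helper, not part of the benchmark):

// Restate the rate printed by Benchmark(): bytes per microsecond == MB/s.
double HaloRateMBs(double pkt_bytes, int ranks_per_node, int ncall, double t_usec)
{
  double xbytes    = pkt_bytes * ranks_per_node * 2.0 * ncall; // both shift directions, ncall times
  double bidibytes = 2.0 * xbytes;                             // count send and receive
  return bidibytes / t_usec;
}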
@ -90,11 +90,11 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
  Dirichlet[0] = 0;
  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
  // Dirichlet[0] = 0;
  // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
  // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
  // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
  // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];

  Benchmark(Ls,Dirichlet);
@ -105,11 +105,11 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
  Dirichlet[0] = 0;
  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
  // Dirichlet[0] = 0;
  // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
  // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
  // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
  // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];

  Benchmark(Ls,Dirichlet);
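The two hunks above set the Dirichlet cut per dimension to one block's worth of lattice sites: the first (node-level) variant keeps the shm factor, the second (rank-level) variant drops it, and dimensions that are not split across communicators get 0, meaning no cut. A standalone sketch of the node-level formula, with names following the benchmark source; this is a restatement of the arithmetic, not a Grid API call:

#include <vector>

// Dirichlet[0] is unused; Dirichlet[1..4] hold the per-dimension block size.
std::vector<int> DirichletBlocks(const std::vector<int> &latt4,
                                 const std::vector<int> &mpi,
                                 const std::vector<int> &shm)
{
  std::vector<int> dirichlet(5, 0);
  for (int d = 0; d < 4; d++) {
    int CommDim = (mpi[d] / shm[d]) > 1 ? 1 : 0;             // only cut inter-node dimensions
    dirichlet[d + 1] = CommDim * latt4[d] / mpi[d] * shm[d]; // one node's extent in this dimension
  }
  return dirichlet;
}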
@ -1,54 +0,0 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=1
#PBS -l walltime=01:00:00
##PBS -A Aurora_Deployment
#PBS -A LatticeQCD_aesp

HDIR=/home/paboyle/
#module use /soft/testing/modulefiles/
#module load intel-UMD23.05.25593.11/23.05.25593.11
#module load tools/pti-gpu
#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
#export PATH=$HDIR/tools/bin:$PATH

export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

echo Jobid: $PBS_JOBID
echo Running on host `hostname`
echo Running on nodes `cat $PBS_NODEFILE`

echo NODES
cat $PBS_NODEFILE
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12                 # Number of MPI ranks per node
NDEPTH=4                  # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS

NTOTRANKS=$(( NNODES * NRANKS ))

echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"

#CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
#     ./gpu_tile_compact.sh \
#     ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
#     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"

CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 1.1.2.6 --grid 32.24.32.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD
systems/Aurora/benchmarks/bench1024.pbs (new file, 56 lines)
@ -0,0 +1,56 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=1024
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

cat $PBS_NODEFILE

export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 12288 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD

CMD="mpiexec -np 12288 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1024node.dwf.small

CMD="mpiexec -np 12288 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1024node.dwf
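A quick sanity check of the decomposition in the first Benchmark_dwf_fp32 run above: 1024 nodes at 12 ranks per node gives the 12288 ranks passed to mpiexec, and an 8.8.8.24 layout over a 128.128.128.384 grid leaves each rank a 16.16.16.16 local volume. Illustrative arithmetic only:

#include <cstdio>

int main()
{
  int nodes = 1024, ppn = 12;
  int grid[4] = {128,128,128,384};
  int mpi[4]  = {8,8,8,24};
  std::printf("total ranks  = %d\n", nodes * ppn);   // 12288, matching -np
  std::printf("local volume = %d.%d.%d.%d\n",
              grid[0]/mpi[0], grid[1]/mpi[1], grid[2]/mpi[2], grid[3]/mpi[3]);
  return 0;
}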
systems/Aurora/benchmarks/bench12.pbs (new file, 45 lines)
@ -0,0 +1,45 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

CMD="mpiexec -np 24 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD

CMD="mpiexec -np 24 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD
@ -1,107 +0,0 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

HDIR=/home/paboyle/
#module use /soft/testing/modulefiles/
#module load intel-UMD23.05.25593.11/23.05.25593.11
#module load tools/pti-gpu
#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
#export PATH=$HDIR/tools/bin:$PATH

export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

echo Jobid: $PBS_JOBID
echo Running on host `hostname`
echo Running on nodes `cat $PBS_NODEFILE`

echo NODES
cat $PBS_NODEFILE
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12                 # Number of MPI ranks per node
NDEPTH=4                  # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS

NTOTRANKS=$(( NNODES * NRANKS ))

echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"

CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 1.1.1.2 --grid 32.24.32.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 1-to-1.comms.hmem0
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 1-to-1.comms.hmem1

CMD="mpiexec -np 4 -ppn 2 -d ${NDEPTH} --cpu-bind=depth -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 2.2.1.1 --grid 32.24.32.96 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 2-to-2.comms.hmem1

export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 2-to-2.comms.hmem0

CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 3.2.1.1 --grid 32.24.32.96 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 3-to-3.comms.hmem1

export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 3-to-3.comms.hmem0

CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
     ./gpu_tile_compact4.sh \
     ./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
$CMD | tee 4-to-4.comms.hmem1.nic-affinity

export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
$CMD | tee 4-to-4.comms.hmem0.nic-affinity

CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 3.2.2.1 --grid 32.24.32.96 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 6-to-6.comms.hmem1

export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 6-to-6.comms.hmem0

CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 3.2.2.2 --grid 32.24.32.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 12-to-12.comms.hmem1

export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 12-to-12.comms.hmem0
systems/Aurora/benchmarks/bench2048.pbs (new file, 56 lines)
@ -0,0 +1,56 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=2048
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

cat $PBS_NODEFILE

export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 24576 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD

CMD="mpiexec -np 24576 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2048node.dwf.small

CMD="mpiexec -np 24576 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2048node.dwf
systems/Aurora/benchmarks/bench256.pbs (new file, 48 lines)
@ -0,0 +1,48 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=256
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

cat $PBS_NODEFILE

export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 3072 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD

CMD="mpiexec -np 3072 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 256node.dwf.large
systems/Aurora/benchmarks/bench512.pbs (new file, 48 lines)
@ -0,0 +1,48 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=512
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

cat $PBS_NODEFILE

export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 6144 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD

CMD="mpiexec -np 6144 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 512node.dwf.large
systems/Aurora/benchmarks/bench_scaling.pbs (new file, 80 lines)
@ -0,0 +1,80 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=32
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh

cat $PBS_NODEFILE

export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 384 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD

CMD="mpiexec -np 12 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1node.dwf

CMD="mpiexec -np 24 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2node.dwf

CMD="mpiexec -np 48 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 4node.dwf

CMD="mpiexec -np 96 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 8node.dwf

CMD="mpiexec -np 192 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 16node.dwf

CMD="mpiexec -np 384 -ppn 12 -envall \
     ./gpu_tile_compact.sh \
     ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \
     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 32node.dwf
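The six Benchmark_dwf_fp32 runs above form a weak-scaling series: each step doubles the rank count and one dimension of the global grid, keeping the per-rank volume fixed at 16.32.32.32. A small check of that invariant, illustrative arithmetic only:

#include <cstdio>

int main()
{
  int grid[6][4] = {{16,64,64,96},{32,64,64,96},{32,64,64,192},
                    {32,64,128,192},{32,128,128,192},{64,128,128,192}};
  int mpi [6][4] = {{1,2,2,3},{2,2,2,3},{2,2,2,6},
                    {2,2,4,6},{2,4,4,6},{4,4,4,6}};
  for (int r = 0; r < 6; r++)
    std::printf("%3d ranks : local %d.%d.%d.%d\n",
                mpi[r][0]*mpi[r][1]*mpi[r][2]*mpi[r][3],
                grid[r][0]/mpi[r][0], grid[r][1]/mpi[r][1],
                grid[r][2]/mpi[r][2], grid[r][3]/mpi[r][3]);
  return 0;
}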
@ -1,65 +1,33 @@
#!/bin/bash

display_help() {
  echo " Will map gpu tile to rank in compact and then round-robin fashion"
  echo " Usage (only work for one node of ATS/PVC):"
  echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
  echo
  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
  echo " 0 Rank 0.0"
  echo " 1 Rank 0.1"
  echo " 2 Rank 1.0"
  echo " 3 Rank 1.1"
  echo " 4 Rank 2.0"
  echo " 5 Rank 2.1"
  echo " 6 Rank 0.0"
  echo
  echo " Hacked together by apl@anl.gov, please contact if bug found"
  exit 1
}

#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
num_gpu=6
num_tile=2

if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
  display_help
fi

gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
tile_id=$((PALS_LOCAL_RANKID % num_tile))

export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )

export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}

export GRID_MPICH_NIC_BIND=$NIC
#export GRID_MPICH_NIC_BIND=$NIC
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id

unset EnableWalkerPartition
export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1

#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "

if [ $PALS_LOCAL_RANKID = 0 ]
then
  numactl -m $NUMA -N $NUMA "$@"
else
  numactl -m $NUMA -N $NUMA "$@"
fi
numactl -m $NUMA -N $NUMAP "$@"
@ -1,39 +1,8 @@
#!/bin/bash

display_help() {
  echo " Will map gpu tile to rank in compact and then round-robin fashion"
  echo " Usage (only work for one node of ATS/PVC):"
  echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
  echo
  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
  echo " 0 Rank 0.0"
  echo " 1 Rank 0.1"
  echo " 2 Rank 1.0"
  echo " 3 Rank 1.1"
  echo " 4 Rank 2.0"
  echo " 5 Rank 2.1"
  echo " 6 Rank 0.0"
  echo
  echo " Hacked together by apl@anl.gov, please contact if bug found"
  exit 1
}

#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
num_gpu=6
num_tile=2

if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
  display_help
fi

gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
tile_id=$((PALS_LOCAL_RANKID % num_tile))

export NUMA_MAP=(0 0 1 1 0 0 1 1 )
export NIC_MAP=(0 1 4 5 0 1 4 5 )
export NUMA_MAP=(2 2 3 3 2 2 3 3 )
export PROC_MAP=(0 0 1 1 0 0 1 1 )
export NIC_MAP=(0 0 4 4 1 1 5 5 )
export GPU_MAP=(0 1 3 4 0 1 3 4 )
export TILE_MAP=(0 0 0 0 1 1 1 1 )
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
@ -41,7 +10,7 @@ export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}

export GRID_MPICH_NIC_BIND=$NIC
#export GRID_MPICH_NIC_BIND=$NIC

unset EnableWalkerPartition
export EnableImplicitScaling=0
@ -55,6 +24,6 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA"

numactl -m $NUMA -N $NUMA "$@"
numactl -m $NUMA -N $PROC_MAP "$@"
@ -7,6 +7,6 @@ export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
git config --global http.proxy http://proxy.alcf.anl.gov:3128