
Merge branch 'develop' of https://github.com/paboyle/Grid into develop

Peter Boyle 2021-09-21 12:18:05 -07:00
commit b2ccaad761
15 changed files with 594 additions and 171 deletions

View File

@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
bool Stencil_force_mpi = true;
///////////////////////////////////////////////////////////////
// Info that is setup once and independent of cartesian layout
///////////////////////////////////////////////////////////////

View File

@ -35,11 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
#ifdef GRID_MPI3_SHM_NVLINK
const bool Stencil_force_mpi = true;
#else
const bool Stencil_force_mpi = false;
#endif
extern bool Stencil_force_mpi ;
class CartesianCommunicator : public SharedMemory {
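
The two hunks above replace the compile-time const (selected by GRID_MPI3_SHM_NVLINK) with a runtime-settable global: the header declares it extern, one translation unit defines the default, and Grid_init() can override it from the command line. A minimal, self-contained sketch of that pattern (file roles are illustrative, not Grid's actual layout):

#include <cstdlib>
#include <iostream>

// Header side (in Grid: the communicator header): declaration visible to every translation unit.
extern bool Stencil_force_mpi;

// Exactly one translation unit (the communicator .cc) provides the definition with its default.
bool Stencil_force_mpi = true;

int main(int argc, char **argv) {
  // Runtime override, analogous to Grid_init() parsing --shm-mpi 0|1 later in this commit.
  if (argc > 1) Stencil_force_mpi = (std::atoi(argv[1]) != 0);
  std::cout << "Stencil_force_mpi = " << Stencil_force_mpi << std::endl;
  return 0;
}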

View File

@ -384,6 +384,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
// TODO : make an OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
acceleratorCopySynchronise(); // MPI prob slower
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
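
A hedged, self-contained sketch of the decision this hunk adds: go through MPI when the peer is off node or Stencil_force_mpi is set, otherwise copy straight into the peer's shared-memory window. Only the branch structure is taken from the diff; plain memcpy stands in for the device copy and ShmBufferTranslate is mocked.

#include <cstdio>
#include <cstring>

bool Stencil_force_mpi = true;                 // runtime flag introduced in this commit

// Stand-in for CartesianCommunicator::ShmBufferTranslate: NULL means the peer is off node.
void *ShmBufferTranslateStub(int dest) {
  static char peer_window[1024];
  return (dest % 2 == 0) ? (void *)peer_window : nullptr;
}

void StencilSend(const void *xmit, int dest, size_t bytes) {
  void *shm = ShmBufferTranslateStub(dest);
  if (shm == nullptr || Stencil_force_mpi) {
    std::printf("dest %d: post MPI_Isend/MPI_Irecv for %zu bytes\n", dest, bytes);
  } else {
    // Grid uses acceleratorCopyDeviceToDeviceAsynch + acceleratorCopySynchronise here.
    std::memcpy(shm, xmit, bytes);
    std::printf("dest %d: direct copy into peer's shm window (%zu bytes)\n", dest, bytes);
  }
}

int main() {
  char halo[64] = "face data";
  Stencil_force_mpi = false;                   // e.g. set from --shm-mpi 0
  StencilSend(halo, 0, sizeof(halo));          // on-node peer: direct shm copy
  StencilSend(halo, 1, sizeof(halo));          // off-node peer: MPI path
  return 0;
}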

View File

@ -543,6 +543,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
ze_ipc_mem_handle_t handle;
if ( r==WorldShmRank ) {
@ -580,6 +582,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
}
}
#endif
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
@ -595,7 +598,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
@ -636,7 +639,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
WorldShmCommBufs[r] = thisBuf;
}
WorldShmCommBufs[r] = thisBuf;
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif

View File

@ -326,21 +326,8 @@ public:
int xmit_to_rank;
if ( ! comm_dim ) return 1;
int nbr_proc;
if (displacement>0) nbr_proc = 1;
else nbr_proc = pd-1;
// FIXME this logic needs to be sorted for three link term
// assert( (displacement==1) || (displacement==-1));
// Present hack only works for >= 4^4 subvol per node
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
if ( (shm==NULL) || Stencil_force_mpi ) return 0;
return 1;
if ( displacement == 0 ) return 1;
return 0;
}
//////////////////////////////////////////
@ -1020,7 +1007,6 @@ public:
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
int shm_receive_only = 1;
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
@ -1052,10 +1038,6 @@ public:
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
/////////////////////////////////////////////////////////
// try the direct copy if possible
/////////////////////////////////////////////////////////
cobj *send_buf;
cobj *recv_buf;
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
@ -1063,52 +1045,36 @@ public:
recv_buf=this->u_recv_buf_p;
}
send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
if ( (send_buf==NULL) || Stencil_force_mpi ) {
send_buf = this->u_send_buf_p;
}
// Find out if we get the direct copy.
void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p);
if ((success==NULL)||Stencil_force_mpi) {
// we found a packet that comes from MPI and contributes to this leg of stencil
shm_receive_only = 0;
}
cobj *send_buf;
send_buf = this->u_send_buf_p; // Gather locally, must send
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
gathertime-=usecond();
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
gathertime+=usecond();
///////////////////////////////////////////////////////////
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&recv_buf[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
if ( compress.DecompressionStep() ) {
if ( shm_receive_only ) { // Early decompress before MPI is finished is possible
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,DecompressionsSHM);
} else { // Decompress after MPI is finished
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,Decompressions);
}
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&recv_buf[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
} else {
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&this->u_recv_buf_p[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,Decompressions);
}
u_comm_offset+=words;
}
}
return shm_receive_only;
return 0;
}
template<class compressor>
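
The hunk above removes the write-directly-into-the-peer shortcut from the gather path: the face is always gathered into u_send_buf_p, a packet is posted, and decompression runs after MPI completes. A small, self-contained sketch of the "gather locally, then send" step, with plain doubles in place of Grid's compressed objects:

#include <cstdio>
#include <vector>

// Gather the face sites listed in 'table' from the local field into a contiguous send buffer.
void GatherPlaneSimple(const std::vector<int> &table, const std::vector<double> &field,
                       std::vector<double> &send_buf, size_t offset) {
  for (size_t i = 0; i < table.size(); i++) send_buf[offset + i] = field[table[i]];
}

int main() {
  std::vector<double> field(64);
  for (int i = 0; i < 64; i++) field[i] = i;
  std::vector<int> face = {3, 7, 11, 15};      // illustrative face-site indices
  std::vector<double> send_buf(face.size(), 0.0);
  GatherPlaneSimple(face, field, send_buf, 0); // "Gather locally, must send"
  // ...then AddPacket(&send_buf[off], &recv_buf[off], xmit_to_rank, recv_from_rank, bytes)
  // and, once MPI finishes, AddDecompress into u_recv_buf_p, as in the hunk above.
  std::printf("gathered: %g %g %g %g\n", send_buf[0], send_buf[1], send_buf[2], send_buf[3]);
  return 0;
}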
@ -1159,7 +1125,6 @@ public:
int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
int shm_receive_only = 1;
for(int x=0;x<rd;x++){
int any_offnode = ( ((x+sshift)%fd) >= rd );
@ -1214,20 +1179,7 @@ public:
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
// shm == receive pointer if offnode
// shm == Translate[send pointer] if on node -- my view of his send pointer
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
if ((shm==NULL)||Stencil_force_mpi) {
shm = rp;
// we found a packet that comes from MPI and contributes to this shift.
// is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
// Kernel will add the exterior_terms except if is_same_node.
shm_receive_only = 0;
// leg of stencil
}
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
// assuming above pointer flip
rpointers[i] = shm;
rpointers[i] = rp;
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
@ -1239,102 +1191,17 @@ public:
}
}
if ( shm_receive_only ) {
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM);
} else {
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
}
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
u_comm_offset +=buffer_size;
}
}
return shm_receive_only;
return 0;
}
void ZeroCounters(void) {
gathertime = 0.;
commtime = 0.;
mpi3synctime=0.;
mpi3synctime_g=0.;
shmmergetime=0.;
for(int i=0;i<this->_npoints;i++){
comm_time_thr[i]=0;
comm_bytes_thr[i]=0;
comm_enter_thr[i]=0;
comm_leave_thr[i]=0;
shm_bytes_thr[i]=0;
}
halogtime = 0.;
mergetime = 0.;
decompresstime = 0.;
gathermtime = 0.;
splicetime = 0.;
nosplicetime = 0.;
comms_bytes = 0.;
shm_bytes = 0.;
calls = 0.;
};
void ZeroCounters(void) { };
void Report(void) {
#define AVERAGE(A)
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
double t = 0;
// if comm_time_thr is set they were all done in parallel so take the max
// but add up the bytes
int threaded = 0 ;
for (int i = 0; i < 8; ++i) {
if ( comm_time_thr[i]>0.0 ) {
threaded = 1;
comms_bytes += comm_bytes_thr[i];
shm_bytes += shm_bytes_thr[i];
if (t < comm_time_thr[i]) t = comm_time_thr[i];
}
}
if (threaded) commtime += t;
_grid->GlobalSum(commtime); commtime/=NP;
if ( calls > 0. ) {
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
PRINTIT(halogtime);
PRINTIT(gathertime);
PRINTIT(gathermtime);
PRINTIT(mergetime);
PRINTIT(decompresstime);
if(comms_bytes>1.0){
PRINTIT(comms_bytes);
PRINTIT(commtime);
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
}
if(shm_bytes>1.0){
PRINTIT(shm_bytes); // X bytes + R bytes
// Double this to include spin projection overhead with 2:1 ratio in wilson
auto gatheralltime = gathertime+gathermtime;
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
auto all_bytes = comms_bytes+shm_bytes;
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
auto membytes = (shm_bytes + comms_bytes/2) // read/write
+ (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj);
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
}
/*
PRINTIT(mpi3synctime);
PRINTIT(mpi3synctime_g);
PRINTIT(shmmergetime);
PRINTIT(splicetime);
PRINTIT(nosplicetime);
*/
}
#undef PRINTIT
#undef AVERAGE
};
void Report(void) { };
};
NAMESPACE_END(Grid);

View File

@ -8,6 +8,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
void acceleratorInit(void)
{
int nDevices = 1;

View File

@ -105,6 +105,7 @@ void acceleratorInit(void);
#define accelerator_inline __host__ __device__ inline
extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@ -213,9 +214,13 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline int acceleratorIsCommunicable(void *ptr)
{
// int uvm=0;
@ -289,7 +294,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
theGridAccelerator->memcpy(to,from,bytes);
}
inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
@ -394,7 +402,8 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
#endif
@ -435,7 +444,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
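
The new acceleratorCopyDeviceToDeviceAsynch / acceleratorCopySynchronise pair lets several copies be queued before a single synchronisation point. A hedged usage sketch using CPU stand-ins that mirror the host fallback in this hunk; on CUDA the asynch variant enqueues on copyStream and the synchronise is cudaStreamSynchronize(copyStream).

#include <cstdio>
#include <cstring>
#include <vector>

// CPU stand-ins: plain memcpy and a no-op synchronisation, as in the non-accelerator build.
inline void acceleratorCopyDeviceToDeviceAsynch(void *from, void *to, size_t bytes) { memcpy(to, from, bytes); }
inline void acceleratorCopySynchronise(void) {}

int main() {
  std::vector<double> src_a(256, 1.0), src_b(256, 2.0), dst_a(256), dst_b(256);
  // Queue two halo-sized copies, then synchronise once before the data is consumed.
  acceleratorCopyDeviceToDeviceAsynch(src_a.data(), dst_a.data(), src_a.size() * sizeof(double));
  acceleratorCopyDeviceToDeviceAsynch(src_b.data(), dst_b.data(), src_b.size() * sizeof(double));
  acceleratorCopySynchronise();
  std::printf("dst_a[0]=%g dst_b[0]=%g\n", dst_a[0], dst_b[0]);
  return 0;
}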

View File

@ -301,6 +301,13 @@ void Grid_init(int *argc,char ***argv)
GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-mpi") ){
int forcempi;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm-mpi");
GridCmdOptionInt(arg,forcempi);
Stencil_force_mpi = (bool)forcempi;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){
int MB;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem");
@ -419,7 +426,9 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --shm-mpi 0|1 : Force MPI usage under multi-rank per node "<<std::endl;
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
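
With the parsing above, the flag can be toggled per run to compare forced-MPI against direct intranode copies. A hedged example invocation (options modelled on the benchmark scripts later in this commit; only --shm-mpi is new):

srun -N 4 -n 16 ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 \
     --accelerator-threads 8 --shm 2048 --shm-mpi 1    # force MPI even between ranks on one node
# repeat with --shm-mpi 0 to allow direct shm/NVLink copies for comparison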

View File

@ -0,0 +1,129 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1463a0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.729967 s : Grid is setup to use 4 threads
Grid : Message : 0.729975 s : Number of iterations to average: 250
Grid : Message : 0.729977 s : ====================================================================================================
Grid : Message : 0.729978 s : = Benchmarking sequential halo exchange from host memory
Grid : Message : 0.729979 s : ====================================================================================================
Grid : Message : 0.729980 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max)
Grid : Message : 0.749870 s : 8 8 393216 50783.4 101566.8
Grid : Message : 0.764282 s : 8 8 393216 54704.5 109409.0
Grid : Message : 0.780310 s : 8 8 393216 49090.6 98181.3
Grid : Message : 0.796479 s : 8 8 393216 48662.3 97324.7
Grid : Message : 0.841551 s : 12 8 1327104 66728.9 133457.8
Grid : Message : 0.880653 s : 12 8 1327104 67932.9 135865.9
Grid : Message : 0.920097 s : 12 8 1327104 67304.2 134608.4
Grid : Message : 0.961444 s : 12 8 1327104 64205.9 128411.8
Grid : Message : 1.660890 s : 16 8 3145728 67833.1 135666.3
Grid : Message : 1.153006 s : 16 8 3145728 72416.3 144832.6
Grid : Message : 1.240962 s : 16 8 3145728 71536.1 143072.2
Grid : Message : 1.330372 s : 16 8 3145728 70372.7 140745.3
Grid : Message : 1.519996 s : 20 8 6144000 71017.4 142034.8
Grid : Message : 1.667745 s : 20 8 6144000 83189.5 166378.9
Grid : Message : 1.817908 s : 20 8 6144000 81836.5 163673.1
Grid : Message : 1.969344 s : 20 8 6144000 81148.0 162296.0
Grid : Message : 2.260249 s : 24 8 10616832 79299.9 158599.8
Grid : Message : 2.512319 s : 24 8 10616832 84249.2 168498.4
Grid : Message : 2.763820 s : 24 8 10616832 84430.4 168860.9
Grid : Message : 3.172850 s : 24 8 10616832 83776.5 167553.1
Grid : Message : 3.460951 s : 28 8 16859136 82176.6 164353.1
Grid : Message : 3.859348 s : 28 8 16859136 84642.9 169285.9
Grid : Message : 4.254351 s : 28 8 16859136 85366.0 170731.9
Grid : Message : 4.651748 s : 28 8 16859136 84850.2 169700.4
Grid : Message : 5.302166 s : 32 8 25165824 83402.1 166804.1
Grid : Message : 5.889123 s : 32 8 25165824 85756.3 171512.6
Grid : Message : 6.472357 s : 32 8 25165824 86299.1 172598.3
Grid : Message : 7.572140 s : 32 8 25165824 86059.7 172119.3
Grid : Message : 7.578700 s : ====================================================================================================
Grid : Message : 7.578740 s : = Benchmarking sequential halo exchange from GPU memory
Grid : Message : 7.578750 s : ====================================================================================================
Grid : Message : 7.578760 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max)
Grid : Message : 7.119231 s : 8 8 393216 13844.9 27689.8
Grid : Message : 7.150661 s : 8 8 393216 25034.4 50068.9
Grid : Message : 7.173800 s : 8 8 393216 34002.0 68004.0
Grid : Message : 7.197415 s : 8 8 393216 33317.7 66635.5
Grid : Message : 7.240696 s : 12 8 1327104 110772.0 221544.0
Grid : Message : 7.263466 s : 12 8 1327104 116627.5 233254.9
Grid : Message : 7.310752 s : 12 8 1327104 56142.8 112285.6
Grid : Message : 7.356881 s : 12 8 1327104 57551.3 115102.6
Grid : Message : 7.422351 s : 16 8 3145728 167086.0 334172.0
Grid : Message : 7.458334 s : 16 8 3145728 174903.6 349807.1
Grid : Message : 7.558746 s : 16 8 3145728 62663.3 125326.6
Grid : Message : 7.658824 s : 16 8 3145728 62871.8 125743.6
Grid : Message : 7.741423 s : 20 8 6144000 231840.3 463680.6
Grid : Message : 7.794862 s : 20 8 6144000 229996.1 459992.1
Grid : Message : 7.982472 s : 20 8 6144000 65501.1 131002.1
Grid : Message : 8.170548 s : 20 8 6144000 65338.8 130677.5
Grid : Message : 8.277182 s : 24 8 10616832 274319.0 548638.0
Grid : Message : 8.354585 s : 24 8 10616832 274365.1 548730.2
Grid : Message : 8.675675 s : 24 8 10616832 66132.8 132265.7
Grid : Message : 8.999237 s : 24 8 10616832 65627.4 131254.7
Grid : Message : 9.140302 s : 28 8 16859136 300825.0 601650.0
Grid : Message : 9.251320 s : 28 8 16859136 303749.1 607498.1
Grid : Message : 9.632241 s : 28 8 16859136 88520.3 177040.6
Grid : Message : 9.999663 s : 28 8 16859136 91772.9 183545.7
Grid : Message : 10.183071 s : 32 8 25165824 328325.5 656651.1
Grid : Message : 10.335093 s : 32 8 25165824 331109.7 662219.3
Grid : Message : 10.875980 s : 32 8 25165824 93056.0 186111.9
Grid : Message : 11.418666 s : 32 8 25165824 92747.5 185495.0
Grid : Message : 11.434792 s : ====================================================================================================
Grid : Message : 11.434797 s : = All done; Bye Bye
Grid : Message : 11.434798 s : ====================================================================================================

View File

@ -0,0 +1,14 @@
LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$LIME \
--disable-accelerator-cshift \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"

View File

@ -0,0 +1,156 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ac40000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=f660dc67e4b193afc4015bc5e5fe47cfdbb0356e: (HEAD -> develop, origin/develop, origin/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.910318 s : Grid Layout
Grid : Message : 0.910320 s : Global lattice size : 64 64 64 256
Grid : Message : 0.910325 s : OpenMP threads : 4
Grid : Message : 0.910326 s : MPI tasks : 2 2 2 8
Grid : Message : 0.973956 s : Making s innermost grids
Grid : Message : 1.198830 s : Initialising 4d RNG
Grid : Message : 1.119813 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.119870 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.683307 s : Initialising 5d RNG
Grid : Message : 4.220535 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.220563 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.198140 s : Initialised RNGs
Grid : Message : 39.952612 s : Drawing gauge field
Grid : Message : 40.488019 s : Random gauge initialised
Grid : Message : 42.659220 s : Setting up Cshift based reference
Grid : Message : 47.622210 s : *****************************************************************
Grid : Message : 47.622236 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 47.622237 s : *****************************************************************
Grid : Message : 47.622238 s : *****************************************************************
Grid : Message : 47.622239 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 47.622240 s : * Vectorising space-time by 8
Grid : Message : 47.622241 s : * VComplexF size is 64 B
Grid : Message : 47.622242 s : * SINGLE precision
Grid : Message : 47.622243 s : * Using Overlapped Comms/Compute
Grid : Message : 47.622244 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 47.622245 s : *****************************************************************
Grid : Message : 48.950210 s : Called warmup
Grid : Message : 77.311124 s : Called Dw 3000 times in 2.83592e+07 us
Grid : Message : 77.311181 s : mflop/s = 1.49934e+08
Grid : Message : 77.311184 s : mflop/s per rank = 2.34273e+06
Grid : Message : 77.311185 s : mflop/s per node = 9.37091e+06
Grid : Message : 77.311186 s : RF GiB/s (base 2) = 304663
Grid : Message : 77.311187 s : mem GiB/s (base 2) = 190415
Grid : Message : 77.314752 s : norm diff 1.03478e-13
Grid : Message : 77.349587 s : #### Dhop calls report
Grid : Message : 77.349591 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 77.349613 s : WilsonFermion5D TotalTime /Calls : 4761.53 us
Grid : Message : 77.349615 s : WilsonFermion5D CommTime /Calls : 3363.09 us
Grid : Message : 77.349616 s : WilsonFermion5D FaceTime /Calls : 469.094 us
Grid : Message : 77.349617 s : WilsonFermion5D ComputeTime1/Calls : 26.8794 us
Grid : Message : 77.349618 s : WilsonFermion5D ComputeTime2/Calls : 949.276 us
Grid : Message : 77.349702 s : Average mflops/s per call : 2.68569e+10
Grid : Message : 77.349710 s : Average mflops/s per call per rank : 4.1964e+08
Grid : Message : 77.349711 s : Average mflops/s per call per node : 1.67856e+09
Grid : Message : 77.349712 s : Average mflops/s per call (full) : 1.51538e+08
Grid : Message : 77.349713 s : Average mflops/s per call per rank (full): 2.36779e+06
Grid : Message : 77.349714 s : Average mflops/s per call per node (full): 9.47115e+06
Grid : Message : 77.349715 s : WilsonFermion5D Stencil
Grid : Message : 77.349716 s : WilsonFermion5D StencilEven
Grid : Message : 77.349717 s : WilsonFermion5D StencilOdd
Grid : Message : 77.349718 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 77.349719 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 77.349720 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 104.883719 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 104.883743 s : Called DwDag
Grid : Message : 104.883744 s : norm dag result 12.0421
Grid : Message : 104.901901 s : norm dag ref 12.0421
Grid : Message : 104.917822 s : norm dag diff 7.63254e-14
Grid : Message : 104.957229 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 105.334551 s : src_e0.499998
Grid : Message : 105.416616 s : src_o0.500002
Grid : Message : 105.486729 s : *********************************************************
Grid : Message : 105.486732 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 105.486733 s : * Vectorising space-time by 8
Grid : Message : 105.486734 s : * SINGLE precision
Grid : Message : 105.486739 s : * Using Overlapped Comms/Compute
Grid : Message : 105.486740 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 105.486741 s : *********************************************************
Grid : Message : 119.695464 s : Deo mflop/s = 1.5039e+08
Grid : Message : 119.695494 s : Deo mflop/s per rank 2.34984e+06
Grid : Message : 119.695496 s : Deo mflop/s per node 9.39937e+06
Grid : Message : 119.695502 s : #### Dhop calls report
Grid : Message : 119.695503 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 119.695505 s : WilsonFermion5D TotalTime /Calls : 4734.45 us
Grid : Message : 119.695507 s : WilsonFermion5D CommTime /Calls : 3287.23 us
Grid : Message : 119.695508 s : WilsonFermion5D FaceTime /Calls : 537.724 us
Grid : Message : 119.695509 s : WilsonFermion5D ComputeTime1/Calls : 16.0483 us
Grid : Message : 119.695510 s : WilsonFermion5D ComputeTime2/Calls : 939.854 us
Grid : Message : 119.695533 s : Average mflops/s per call : 4.50726e+10
Grid : Message : 119.695535 s : Average mflops/s per call per rank : 7.04259e+08
Grid : Message : 119.695536 s : Average mflops/s per call per node : 2.81703e+09
Grid : Message : 119.695537 s : Average mflops/s per call (full) : 1.52405e+08
Grid : Message : 119.695538 s : Average mflops/s per call per rank (full): 2.38133e+06
Grid : Message : 119.695539 s : Average mflops/s per call per node (full): 9.52532e+06
Grid : Message : 119.695540 s : WilsonFermion5D Stencil
Grid : Message : 119.695541 s : WilsonFermion5D StencilEven
Grid : Message : 119.695542 s : WilsonFermion5D StencilOdd
Grid : Message : 119.695543 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 119.695544 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 119.695545 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 119.752707 s : r_e6.02108
Grid : Message : 119.759448 s : r_o6.02101
Grid : Message : 119.765382 s : res12.0421
Grid : Message : 120.419093 s : norm diff 0
Grid : Message : 120.829772 s : norm diff even 0
Grid : Message : 120.909078 s : norm diff odd 0

View File

@ -0,0 +1,156 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e9c0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.717713 s : Grid Layout
Grid : Message : 0.717716 s : Global lattice size : 64 64 64 64
Grid : Message : 0.717724 s : OpenMP threads : 4
Grid : Message : 0.717725 s : MPI tasks : 2 2 2 2
Grid : Message : 0.801634 s : Making s innermost grids
Grid : Message : 0.844903 s : Initialising 4d RNG
Grid : Message : 0.940001 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.940060 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.338368 s : Initialising 5d RNG
Grid : Message : 2.859273 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.859304 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 11.140924 s : Initialised RNGs
Grid : Message : 13.433456 s : Drawing gauge field
Grid : Message : 13.955847 s : Random gauge initialised
Grid : Message : 15.528535 s : Setting up Cshift based reference
Grid : Message : 21.484340 s : *****************************************************************
Grid : Message : 21.484840 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 21.484860 s : *****************************************************************
Grid : Message : 21.484870 s : *****************************************************************
Grid : Message : 21.484880 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 21.484890 s : * Vectorising space-time by 8
Grid : Message : 21.484900 s : * VComplexF size is 64 B
Grid : Message : 21.484910 s : * SINGLE precision
Grid : Message : 21.484920 s : * Using Overlapped Comms/Compute
Grid : Message : 21.484930 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 21.484940 s : *****************************************************************
Grid : Message : 22.344741 s : Called warmup
Grid : Message : 49.832292 s : Called Dw 3000 times in 2.74873e+07 us
Grid : Message : 49.832358 s : mflop/s = 3.86726e+07
Grid : Message : 49.832360 s : mflop/s per rank = 2.41704e+06
Grid : Message : 49.832361 s : mflop/s per node = 9.66814e+06
Grid : Message : 49.832362 s : RF GiB/s (base 2) = 78581.7
Grid : Message : 49.832363 s : mem GiB/s (base 2) = 49113.6
Grid : Message : 49.835924 s : norm diff 1.03481e-13
Grid : Message : 49.870568 s : #### Dhop calls report
Grid : Message : 49.870574 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 49.870598 s : WilsonFermion5D TotalTime /Calls : 4616.79 us
Grid : Message : 49.870600 s : WilsonFermion5D CommTime /Calls : 3241.77 us
Grid : Message : 49.870601 s : WilsonFermion5D FaceTime /Calls : 469.006 us
Grid : Message : 49.870602 s : WilsonFermion5D ComputeTime1/Calls : 27.0492 us
Grid : Message : 49.870603 s : WilsonFermion5D ComputeTime2/Calls : 926.33 us
Grid : Message : 49.870614 s : Average mflops/s per call : 6.71631e+09
Grid : Message : 49.870619 s : Average mflops/s per call per rank : 4.19769e+08
Grid : Message : 49.870621 s : Average mflops/s per call per node : 1.67908e+09
Grid : Message : 49.870626 s : Average mflops/s per call (full) : 3.90723e+07
Grid : Message : 49.870627 s : Average mflops/s per call per rank (full): 2.44202e+06
Grid : Message : 49.870628 s : Average mflops/s per call per node (full): 9.76808e+06
Grid : Message : 49.870629 s : WilsonFermion5D Stencil
Grid : Message : 49.870630 s : WilsonFermion5D StencilEven
Grid : Message : 49.870631 s : WilsonFermion5D StencilOdd
Grid : Message : 49.870632 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 49.870633 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 49.870634 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 77.321890 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 77.321911 s : Called DwDag
Grid : Message : 77.321912 s : norm dag result 12.0421
Grid : Message : 77.334619 s : norm dag ref 12.0421
Grid : Message : 77.350515 s : norm dag diff 7.63236e-14
Grid : Message : 77.389923 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 77.769815 s : src_e0.499997
Grid : Message : 77.847560 s : src_o0.500003
Grid : Message : 77.917493 s : *********************************************************
Grid : Message : 77.917496 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 77.917497 s : * Vectorising space-time by 8
Grid : Message : 77.917498 s : * SINGLE precision
Grid : Message : 77.917499 s : * Using Overlapped Comms/Compute
Grid : Message : 77.917500 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 77.917501 s : *********************************************************
Grid : Message : 91.412946 s : Deo mflop/s = 3.95925e+07
Grid : Message : 91.412978 s : Deo mflop/s per rank 2.47453e+06
Grid : Message : 91.412980 s : Deo mflop/s per node 9.89813e+06
Grid : Message : 91.412983 s : #### Dhop calls report
Grid : Message : 91.412984 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 91.412986 s : WilsonFermion5D TotalTime /Calls : 4496.84 us
Grid : Message : 91.412988 s : WilsonFermion5D CommTime /Calls : 3057.28 us
Grid : Message : 91.412989 s : WilsonFermion5D FaceTime /Calls : 528.499 us
Grid : Message : 91.412990 s : WilsonFermion5D ComputeTime1/Calls : 16.1939 us
Grid : Message : 91.412991 s : WilsonFermion5D ComputeTime2/Calls : 942.557 us
Grid : Message : 91.413021 s : Average mflops/s per call : 1.12574e+10
Grid : Message : 91.413023 s : Average mflops/s per call per rank : 7.03586e+08
Grid : Message : 91.413024 s : Average mflops/s per call per node : 2.81434e+09
Grid : Message : 91.413025 s : Average mflops/s per call (full) : 4.01145e+07
Grid : Message : 91.413026 s : Average mflops/s per call per rank (full): 2.50716e+06
Grid : Message : 91.413027 s : Average mflops/s per call per node (full): 1.00286e+07
Grid : Message : 91.413028 s : WilsonFermion5D Stencil
Grid : Message : 91.413029 s : WilsonFermion5D StencilEven
Grid : Message : 91.413030 s : WilsonFermion5D StencilOdd
Grid : Message : 91.413031 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 91.413032 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 91.413033 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 91.470394 s : r_e6.02111
Grid : Message : 91.476539 s : r_o6.02102
Grid : Message : 91.482442 s : res12.0421
Grid : Message : 92.138799 s : norm diff 0
Grid : Message : 92.545354 s : norm diff even 0
Grid : Message : 92.619444 s : norm diff odd 0

View File

@ -0,0 +1,29 @@
#!/bin/sh
#SBATCH --account=gm2dwf
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=0:30:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap --comms-concurrent"
srun -N 16 -n $SLURM_NTASKS \
./benchmarks/Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 > dwf.16node.perf

View File

@ -0,0 +1,39 @@
#!/bin/sh
#SBATCH --account=gm2dwf
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=2:00:00
#SBATCH --partition=develbooster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap --comms-concurrent"
srun -N 4 -n $SLURM_NTASKS \
./benchmarks/Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun -N 4 -n $SLURM_NTASKS \
./benchmarks/Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > comms.4node.perf

View File

@ -0,0 +1,5 @@
module load GCC/9.3.0
module load GMP/6.2.0
module load MPFR/4.1.0
module load OpenMPI/4.1.0rc1
module load CUDA/11.3