Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-10 07:55:35 +00:00)
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
Commit: b2ccaad761
@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

NAMESPACE_BEGIN(Grid);

bool Stencil_force_mpi = true;

///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////
@@ -35,11 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

NAMESPACE_BEGIN(Grid);

#ifdef GRID_MPI3_SHM_NVLINK
const bool Stencil_force_mpi = true;
#else
const bool Stencil_force_mpi = false;
#endif
extern bool Stencil_force_mpi ;

class CartesianCommunicator : public SharedMemory {
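The two hunks above change Stencil_force_mpi from a compile-time constant (selected by GRID_MPI3_SHM_NVLINK) into an ordinary global: defined with a default of true in the communicator source and declared extern in the header, so it can be flipped at run time. A minimal sketch of that pattern, not part of the commit; parse_flag() is a hypothetical stand-in for the GridCmdOptionExists/Payload/Int helpers used later in Init.cc:

// Sketch only: header/source split plus a runtime override for a global flag.
#include <cstring>
#include <cstdlib>

extern bool Stencil_force_mpi;        // header: visible to every translation unit
bool Stencil_force_mpi = true;        // source: compile-time default, no #ifdef needed

// Hypothetical stand-in for the GridCmdOption* parsing added in Init.cc.
static void parse_flag(int argc, char **argv) {
  for (int i = 1; i + 1 < argc; i++) {
    if (std::strcmp(argv[i], "--shm-mpi") == 0) {
      Stencil_force_mpi = (std::atoi(argv[i + 1]) != 0);  // e.g. --shm-mpi 1 forces MPI
    }
  }
}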
@@ -384,6 +384,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
// TODO : make a OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
acceleratorCopySynchronize(); // MPI prob slower
}

if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
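The hunk above is the sending side of the stencil halo exchange: when the destination rank's receive buffer can be translated into this rank's address space (ShmBufferTranslate returns non-NULL), the data is copied straight into the peer's buffer with the new asynchronous device copy; otherwise a real MPI request is posted. A simplified, hedged sketch of that branch, not Grid's code; shm_window() stands in for ShmBufferTranslate() and plain memcpy stands in for the accelerator copy:

// Sketch of the on-node fast path vs. off-node MPI decision shown above.
#include <mpi.h>
#include <cstring>
#include <vector>

// Hypothetical stand-in for SharedMemory::ShmBufferTranslate(); nullptr = not on my node.
static void *shm_window(int /*rank*/, void * /*remote_ptr*/) { return nullptr; }

void send_leg(void *xmit, int dest, size_t bytes, void *recv_hint,
              std::vector<MPI_Request> &list, MPI_Comm comm, bool force_mpi)
{
  void *shm = shm_window(dest, recv_hint);
  if (shm == nullptr || force_mpi) {             // off node, or --shm-mpi 1
    MPI_Request rq;
    MPI_Isend(xmit, (int)bytes, MPI_CHAR, dest, 0, comm, &rq);
    list.push_back(rq);
  } else {                                       // on node: write into the peer's buffer
    std::memcpy(shm, xmit, bytes);               // Grid uses acceleratorCopyDeviceToDeviceAsynch here
  }
}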
@@ -543,6 +543,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
ze_ipc_mem_handle_t handle;
if ( r==WorldShmRank ) {
@@ -580,6 +582,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
}
}
#endif

//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
@@ -595,7 +598,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;

#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
@@ -636,7 +639,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
WorldShmCommBufs[r] = thisBuf;
}
WorldShmCommBufs[r] = thisBuf;
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
@@ -326,21 +326,8 @@ public:
int xmit_to_rank;

if ( ! comm_dim ) return 1;

int nbr_proc;
if (displacement>0) nbr_proc = 1;
else nbr_proc = pd-1;

// FIXME this logic needs to be sorted for three link term
// assert( (displacement==1) || (displacement==-1));
// Present hack only works for >= 4^4 subvol per node
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);

if ( (shm==NULL) || Stencil_force_mpi ) return 0;

return 1;
if ( displacement == 0 ) return 1;
return 0;
}

//////////////////////////////////////////
@@ -1020,7 +1007,6 @@ public:
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

int shm_receive_only = 1;
for(int x=0;x<rd;x++){

int sx = (x+sshift)%rd;
@@ -1052,10 +1038,6 @@ public:
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());

/////////////////////////////////////////////////////////
// try the direct copy if possible
/////////////////////////////////////////////////////////
cobj *send_buf;
cobj *recv_buf;
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
@@ -1063,52 +1045,36 @@ public:
recv_buf=this->u_recv_buf_p;
}

send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
if ( (send_buf==NULL) || Stencil_force_mpi ) {
send_buf = this->u_send_buf_p;
}

// Find out if we get the direct copy.
void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p);
if ((success==NULL)||Stencil_force_mpi) {
// we found a packet that comes from MPI and contributes to this leg of stencil
shm_receive_only = 0;
}
cobj *send_buf;
send_buf = this->u_send_buf_p; // Gather locally, must send

////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
gathertime-=usecond();
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
gathertime+=usecond();

///////////////////////////////////////////////////////////
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&recv_buf[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);

if ( compress.DecompressionStep() ) {

if ( shm_receive_only ) { // Early decompress before MPI is finished is possible
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,DecompressionsSHM);
} else { // Decompress after MPI is finished
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,Decompressions);
}

AddPacket((void *)&send_buf[u_comm_offset],
(void *)&recv_buf[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);

} else {
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&this->u_recv_buf_p[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,Decompressions);
}
u_comm_offset+=words;
}
}
return shm_receive_only;
return 0;
}

template<class compressor>
@@ -1159,7 +1125,6 @@ public:
int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

// loop over outer coord planes orthog to dim
int shm_receive_only = 1;
for(int x=0;x<rd;x++){

int any_offnode = ( ((x+sshift)%fd) >= rd );
@@ -1214,20 +1179,7 @@ public:

_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

// shm == receive pointer if offnode
// shm == Translate[send pointer] if on node -- my view of his send pointer
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
if ((shm==NULL)||Stencil_force_mpi) {
shm = rp;
// we found a packet that comes from MPI and contributes to this shift.
// is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
// Kernel will add the exterior_terms except if is_same_node.
shm_receive_only = 0;
// leg of stencil
}
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
// assuming above pointer flip
rpointers[i] = shm;
rpointers[i] = rp;

AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
@@ -1239,102 +1191,17 @@ public:
}
}

if ( shm_receive_only ) {
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM);
} else {
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
}
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);

u_comm_offset +=buffer_size;
}
}
return shm_receive_only;
return 0;
}

void ZeroCounters(void) {
gathertime = 0.;
commtime = 0.;
mpi3synctime=0.;
mpi3synctime_g=0.;
shmmergetime=0.;
for(int i=0;i<this->_npoints;i++){
comm_time_thr[i]=0;
comm_bytes_thr[i]=0;
comm_enter_thr[i]=0;
comm_leave_thr[i]=0;
shm_bytes_thr[i]=0;
}
halogtime = 0.;
mergetime = 0.;
decompresstime = 0.;
gathermtime = 0.;
splicetime = 0.;
nosplicetime = 0.;
comms_bytes = 0.;
shm_bytes = 0.;
calls = 0.;
};
void ZeroCounters(void) { };

void Report(void) {
#define AVERAGE(A)
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
double t = 0;
// if comm_time_thr is set they were all done in parallel so take the max
// but add up the bytes
int threaded = 0 ;
for (int i = 0; i < 8; ++i) {
if ( comm_time_thr[i]>0.0 ) {
threaded = 1;
comms_bytes += comm_bytes_thr[i];
shm_bytes += shm_bytes_thr[i];
if (t < comm_time_thr[i]) t = comm_time_thr[i];
}
}
if (threaded) commtime += t;

_grid->GlobalSum(commtime); commtime/=NP;
if ( calls > 0. ) {
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
PRINTIT(halogtime);
PRINTIT(gathertime);
PRINTIT(gathermtime);
PRINTIT(mergetime);
PRINTIT(decompresstime);
if(comms_bytes>1.0){
PRINTIT(comms_bytes);
PRINTIT(commtime);
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
}
if(shm_bytes>1.0){
PRINTIT(shm_bytes); // X bytes + R bytes
// Double this to include spin projection overhead with 2:1 ratio in wilson
auto gatheralltime = gathertime+gathermtime;
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;

auto all_bytes = comms_bytes+shm_bytes;
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;

auto membytes = (shm_bytes + comms_bytes/2) // read/write
+ (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj);
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
}
/*
PRINTIT(mpi3synctime);
PRINTIT(mpi3synctime_g);
PRINTIT(shmmergetime);
PRINTIT(splicetime);
PRINTIT(nosplicetime);
*/
}
#undef PRINTIT
#undef AVERAGE
};
void Report(void) { };

};
NAMESPACE_END(Grid);
@@ -8,6 +8,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};

#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
void acceleratorInit(void)
{
int nDevices = 1;
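This hunk introduces the dedicated copyStream used by the asynchronous copies below; the hunk itself does not show where the stream is created, presumably inside acceleratorInit(). A hedged sketch of creating and releasing such a stream with the standard CUDA runtime API, illustrative only and not the commit's exact code:

// Sketch only: lifecycle of a dedicated copy stream.
#include <cuda_runtime.h>

cudaStream_t copyStream;

void create_copy_stream()  { cudaStreamCreate(&copyStream); }   // later used by cudaMemcpyAsync(..., copyStream)
void destroy_copy_stream() { cudaStreamDestroy(copyStream); }   // release the stream at shutdown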
@@ -105,6 +105,7 @@ void acceleratorInit(void);
#define accelerator_inline __host__ __device__ inline

extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;

accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@@ -213,9 +214,13 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline int acceleratorIsCommunicable(void *ptr)
{
// int uvm=0;
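With the definitions above, acceleratorCopyDeviceToDeviceAsynch only enqueues the copy on copyStream and acceleratorCopySynchronise blocks on that stream, so independent work can be overlapped with the transfer. A short usage sketch, illustrative and not from the commit; d_src, d_dst and do_local_compute() are made-up names:

// Illustrative overlap of an asynchronous device-to-device copy with local work.
void do_local_compute();   // hypothetical kernel launch or host-side work

void overlap_example(void *d_dst, void *d_src, size_t bytes)
{
  acceleratorCopyDeviceToDeviceAsynch(d_src, d_dst, bytes);  // enqueued on copyStream
  do_local_compute();                                        // runs while the copy is in flight
  acceleratorCopySynchronise();                              // wait on copyStream before using d_dst
}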
@@ -289,7 +294,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
theGridAccelerator->memcpy(to,from,bytes);
}
inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
@@ -394,7 +402,8 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}

#endif
@@ -435,7 +444,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopySynchronize(void) {};

inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
@@ -301,6 +301,13 @@ void Grid_init(int *argc,char ***argv)
GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
}

if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-mpi") ){
int forcempi;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm-mpi");
GridCmdOptionInt(arg,forcempi);
Stencil_force_mpi = (bool)forcempi;
}

if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){
int MB;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem");
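Together with the Stencil.h and SharedMemory changes above, the option parsed here simply disables the intra-node shared-memory fast path at run time. A minimal, hedged illustration of the decision the flag controls, not Grid's code; peer_window stands in for ShmBufferTranslate():

// Sketch of the runtime decision driven by --shm-mpi.
extern bool Stencil_force_mpi;

bool use_shared_memory_path(void *peer_window)   // peer_window == nullptr means "not on my node"
{
  if (Stencil_force_mpi) return false;           // --shm-mpi 1: always go through MPI
  return peer_window != nullptr;                 // otherwise use the peer's window when visible
}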
@@ -419,7 +426,9 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --shm-mpi 0|1 : Force MPI usage under multi-rank per node "<<std::endl;
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
systems/Booster/comms.4node.perf (new file, 129 lines)
@@ -0,0 +1,129 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1463a0000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.729967 s : Grid is setup to use 4 threads
Grid : Message : 0.729975 s : Number of iterations to average: 250
Grid : Message : 0.729977 s : ====================================================================================================
Grid : Message : 0.729978 s : = Benchmarking sequential halo exchange from host memory
Grid : Message : 0.729979 s : ====================================================================================================
Grid : Message : 0.729980 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max)
Grid : Message : 0.749870 s : 8 8 393216 50783.4 101566.8
Grid : Message : 0.764282 s : 8 8 393216 54704.5 109409.0
Grid : Message : 0.780310 s : 8 8 393216 49090.6 98181.3
Grid : Message : 0.796479 s : 8 8 393216 48662.3 97324.7
Grid : Message : 0.841551 s : 12 8 1327104 66728.9 133457.8
Grid : Message : 0.880653 s : 12 8 1327104 67932.9 135865.9
Grid : Message : 0.920097 s : 12 8 1327104 67304.2 134608.4
Grid : Message : 0.961444 s : 12 8 1327104 64205.9 128411.8
Grid : Message : 1.660890 s : 16 8 3145728 67833.1 135666.3
Grid : Message : 1.153006 s : 16 8 3145728 72416.3 144832.6
Grid : Message : 1.240962 s : 16 8 3145728 71536.1 143072.2
Grid : Message : 1.330372 s : 16 8 3145728 70372.7 140745.3
Grid : Message : 1.519996 s : 20 8 6144000 71017.4 142034.8
Grid : Message : 1.667745 s : 20 8 6144000 83189.5 166378.9
Grid : Message : 1.817908 s : 20 8 6144000 81836.5 163673.1
Grid : Message : 1.969344 s : 20 8 6144000 81148.0 162296.0
Grid : Message : 2.260249 s : 24 8 10616832 79299.9 158599.8
Grid : Message : 2.512319 s : 24 8 10616832 84249.2 168498.4
Grid : Message : 2.763820 s : 24 8 10616832 84430.4 168860.9
Grid : Message : 3.172850 s : 24 8 10616832 83776.5 167553.1
Grid : Message : 3.460951 s : 28 8 16859136 82176.6 164353.1
Grid : Message : 3.859348 s : 28 8 16859136 84642.9 169285.9
Grid : Message : 4.254351 s : 28 8 16859136 85366.0 170731.9
Grid : Message : 4.651748 s : 28 8 16859136 84850.2 169700.4
Grid : Message : 5.302166 s : 32 8 25165824 83402.1 166804.1
Grid : Message : 5.889123 s : 32 8 25165824 85756.3 171512.6
Grid : Message : 6.472357 s : 32 8 25165824 86299.1 172598.3
Grid : Message : 7.572140 s : 32 8 25165824 86059.7 172119.3
Grid : Message : 7.578700 s : ====================================================================================================
Grid : Message : 7.578740 s : = Benchmarking sequential halo exchange from GPU memory
Grid : Message : 7.578750 s : ====================================================================================================
Grid : Message : 7.578760 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max)
Grid : Message : 7.119231 s : 8 8 393216 13844.9 27689.8
Grid : Message : 7.150661 s : 8 8 393216 25034.4 50068.9
Grid : Message : 7.173800 s : 8 8 393216 34002.0 68004.0
Grid : Message : 7.197415 s : 8 8 393216 33317.7 66635.5
Grid : Message : 7.240696 s : 12 8 1327104 110772.0 221544.0
Grid : Message : 7.263466 s : 12 8 1327104 116627.5 233254.9
Grid : Message : 7.310752 s : 12 8 1327104 56142.8 112285.6
Grid : Message : 7.356881 s : 12 8 1327104 57551.3 115102.6
Grid : Message : 7.422351 s : 16 8 3145728 167086.0 334172.0
Grid : Message : 7.458334 s : 16 8 3145728 174903.6 349807.1
Grid : Message : 7.558746 s : 16 8 3145728 62663.3 125326.6
Grid : Message : 7.658824 s : 16 8 3145728 62871.8 125743.6
Grid : Message : 7.741423 s : 20 8 6144000 231840.3 463680.6
Grid : Message : 7.794862 s : 20 8 6144000 229996.1 459992.1
Grid : Message : 7.982472 s : 20 8 6144000 65501.1 131002.1
Grid : Message : 8.170548 s : 20 8 6144000 65338.8 130677.5
Grid : Message : 8.277182 s : 24 8 10616832 274319.0 548638.0
Grid : Message : 8.354585 s : 24 8 10616832 274365.1 548730.2
Grid : Message : 8.675675 s : 24 8 10616832 66132.8 132265.7
Grid : Message : 8.999237 s : 24 8 10616832 65627.4 131254.7
Grid : Message : 9.140302 s : 28 8 16859136 300825.0 601650.0
Grid : Message : 9.251320 s : 28 8 16859136 303749.1 607498.1
Grid : Message : 9.632241 s : 28 8 16859136 88520.3 177040.6
Grid : Message : 9.999663 s : 28 8 16859136 91772.9 183545.7
Grid : Message : 10.183071 s : 32 8 25165824 328325.5 656651.1
Grid : Message : 10.335093 s : 32 8 25165824 331109.7 662219.3
Grid : Message : 10.875980 s : 32 8 25165824 93056.0 186111.9
Grid : Message : 11.418666 s : 32 8 25165824 92747.5 185495.0
Grid : Message : 11.434792 s : ====================================================================================================
Grid : Message : 11.434797 s : = All done; Bye Bye
Grid : Message : 11.434798 s : ====================================================================================================
systems/Booster/config-command (new file, 14 lines)
@@ -0,0 +1,14 @@
LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$LIME \
--disable-accelerator-cshift \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
systems/Booster/dwf.16node.perf (new file, 156 lines)
@@ -0,0 +1,156 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ac40000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=f660dc67e4b193afc4015bc5e5fe47cfdbb0356e: (HEAD -> develop, origin/develop, origin/HEAD) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.910318 s : Grid Layout
Grid : Message : 0.910320 s : Global lattice size : 64 64 64 256
Grid : Message : 0.910325 s : OpenMP threads : 4
Grid : Message : 0.910326 s : MPI tasks : 2 2 2 8
Grid : Message : 0.973956 s : Making s innermost grids
Grid : Message : 1.198830 s : Initialising 4d RNG
Grid : Message : 1.119813 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.119870 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.683307 s : Initialising 5d RNG
Grid : Message : 4.220535 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.220563 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.198140 s : Initialised RNGs
Grid : Message : 39.952612 s : Drawing gauge field
Grid : Message : 40.488019 s : Random gauge initialised
Grid : Message : 42.659220 s : Setting up Cshift based reference
Grid : Message : 47.622210 s : *****************************************************************
Grid : Message : 47.622236 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 47.622237 s : *****************************************************************
Grid : Message : 47.622238 s : *****************************************************************
Grid : Message : 47.622239 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 47.622240 s : * Vectorising space-time by 8
Grid : Message : 47.622241 s : * VComplexF size is 64 B
Grid : Message : 47.622242 s : * SINGLE precision
Grid : Message : 47.622243 s : * Using Overlapped Comms/Compute
Grid : Message : 47.622244 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 47.622245 s : *****************************************************************
Grid : Message : 48.950210 s : Called warmup
Grid : Message : 77.311124 s : Called Dw 3000 times in 2.83592e+07 us
Grid : Message : 77.311181 s : mflop/s = 1.49934e+08
Grid : Message : 77.311184 s : mflop/s per rank = 2.34273e+06
Grid : Message : 77.311185 s : mflop/s per node = 9.37091e+06
Grid : Message : 77.311186 s : RF GiB/s (base 2) = 304663
Grid : Message : 77.311187 s : mem GiB/s (base 2) = 190415
Grid : Message : 77.314752 s : norm diff 1.03478e-13
Grid : Message : 77.349587 s : #### Dhop calls report
Grid : Message : 77.349591 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 77.349613 s : WilsonFermion5D TotalTime /Calls : 4761.53 us
Grid : Message : 77.349615 s : WilsonFermion5D CommTime /Calls : 3363.09 us
Grid : Message : 77.349616 s : WilsonFermion5D FaceTime /Calls : 469.094 us
Grid : Message : 77.349617 s : WilsonFermion5D ComputeTime1/Calls : 26.8794 us
Grid : Message : 77.349618 s : WilsonFermion5D ComputeTime2/Calls : 949.276 us
Grid : Message : 77.349702 s : Average mflops/s per call : 2.68569e+10
Grid : Message : 77.349710 s : Average mflops/s per call per rank : 4.1964e+08
Grid : Message : 77.349711 s : Average mflops/s per call per node : 1.67856e+09
Grid : Message : 77.349712 s : Average mflops/s per call (full) : 1.51538e+08
Grid : Message : 77.349713 s : Average mflops/s per call per rank (full): 2.36779e+06
Grid : Message : 77.349714 s : Average mflops/s per call per node (full): 9.47115e+06
Grid : Message : 77.349715 s : WilsonFermion5D Stencil
Grid : Message : 77.349716 s : WilsonFermion5D StencilEven
Grid : Message : 77.349717 s : WilsonFermion5D StencilOdd
Grid : Message : 77.349718 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 77.349719 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 77.349720 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 104.883719 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 104.883743 s : Called DwDag
Grid : Message : 104.883744 s : norm dag result 12.0421
Grid : Message : 104.901901 s : norm dag ref 12.0421
Grid : Message : 104.917822 s : norm dag diff 7.63254e-14
Grid : Message : 104.957229 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 105.334551 s : src_e0.499998
Grid : Message : 105.416616 s : src_o0.500002
Grid : Message : 105.486729 s : *********************************************************
Grid : Message : 105.486732 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 105.486733 s : * Vectorising space-time by 8
Grid : Message : 105.486734 s : * SINGLE precision
Grid : Message : 105.486739 s : * Using Overlapped Comms/Compute
Grid : Message : 105.486740 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 105.486741 s : *********************************************************
Grid : Message : 119.695464 s : Deo mflop/s = 1.5039e+08
Grid : Message : 119.695494 s : Deo mflop/s per rank 2.34984e+06
Grid : Message : 119.695496 s : Deo mflop/s per node 9.39937e+06
Grid : Message : 119.695502 s : #### Dhop calls report
Grid : Message : 119.695503 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 119.695505 s : WilsonFermion5D TotalTime /Calls : 4734.45 us
Grid : Message : 119.695507 s : WilsonFermion5D CommTime /Calls : 3287.23 us
Grid : Message : 119.695508 s : WilsonFermion5D FaceTime /Calls : 537.724 us
Grid : Message : 119.695509 s : WilsonFermion5D ComputeTime1/Calls : 16.0483 us
Grid : Message : 119.695510 s : WilsonFermion5D ComputeTime2/Calls : 939.854 us
Grid : Message : 119.695533 s : Average mflops/s per call : 4.50726e+10
Grid : Message : 119.695535 s : Average mflops/s per call per rank : 7.04259e+08
Grid : Message : 119.695536 s : Average mflops/s per call per node : 2.81703e+09
Grid : Message : 119.695537 s : Average mflops/s per call (full) : 1.52405e+08
Grid : Message : 119.695538 s : Average mflops/s per call per rank (full): 2.38133e+06
Grid : Message : 119.695539 s : Average mflops/s per call per node (full): 9.52532e+06
Grid : Message : 119.695540 s : WilsonFermion5D Stencil
Grid : Message : 119.695541 s : WilsonFermion5D StencilEven
Grid : Message : 119.695542 s : WilsonFermion5D StencilOdd
Grid : Message : 119.695543 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 119.695544 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 119.695545 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 119.752707 s : r_e6.02108
Grid : Message : 119.759448 s : r_o6.02101
Grid : Message : 119.765382 s : res12.0421
Grid : Message : 120.419093 s : norm diff 0
Grid : Message : 120.829772 s : norm diff even 0
Grid : Message : 120.909078 s : norm diff odd 0
systems/Booster/dwf.4node.perf (new file, 156 lines)
@@ -0,0 +1,156 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e9c0000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.717713 s : Grid Layout
Grid : Message : 0.717716 s : Global lattice size : 64 64 64 64
Grid : Message : 0.717724 s : OpenMP threads : 4
Grid : Message : 0.717725 s : MPI tasks : 2 2 2 2
Grid : Message : 0.801634 s : Making s innermost grids
Grid : Message : 0.844903 s : Initialising 4d RNG
Grid : Message : 0.940001 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.940060 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.338368 s : Initialising 5d RNG
Grid : Message : 2.859273 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.859304 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 11.140924 s : Initialised RNGs
Grid : Message : 13.433456 s : Drawing gauge field
Grid : Message : 13.955847 s : Random gauge initialised
Grid : Message : 15.528535 s : Setting up Cshift based reference
Grid : Message : 21.484340 s : *****************************************************************
Grid : Message : 21.484840 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 21.484860 s : *****************************************************************
Grid : Message : 21.484870 s : *****************************************************************
Grid : Message : 21.484880 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 21.484890 s : * Vectorising space-time by 8
Grid : Message : 21.484900 s : * VComplexF size is 64 B
Grid : Message : 21.484910 s : * SINGLE precision
Grid : Message : 21.484920 s : * Using Overlapped Comms/Compute
Grid : Message : 21.484930 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 21.484940 s : *****************************************************************
Grid : Message : 22.344741 s : Called warmup
Grid : Message : 49.832292 s : Called Dw 3000 times in 2.74873e+07 us
Grid : Message : 49.832358 s : mflop/s = 3.86726e+07
Grid : Message : 49.832360 s : mflop/s per rank = 2.41704e+06
Grid : Message : 49.832361 s : mflop/s per node = 9.66814e+06
Grid : Message : 49.832362 s : RF GiB/s (base 2) = 78581.7
Grid : Message : 49.832363 s : mem GiB/s (base 2) = 49113.6
Grid : Message : 49.835924 s : norm diff 1.03481e-13
Grid : Message : 49.870568 s : #### Dhop calls report
Grid : Message : 49.870574 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 49.870598 s : WilsonFermion5D TotalTime /Calls : 4616.79 us
Grid : Message : 49.870600 s : WilsonFermion5D CommTime /Calls : 3241.77 us
Grid : Message : 49.870601 s : WilsonFermion5D FaceTime /Calls : 469.006 us
Grid : Message : 49.870602 s : WilsonFermion5D ComputeTime1/Calls : 27.0492 us
Grid : Message : 49.870603 s : WilsonFermion5D ComputeTime2/Calls : 926.33 us
Grid : Message : 49.870614 s : Average mflops/s per call : 6.71631e+09
Grid : Message : 49.870619 s : Average mflops/s per call per rank : 4.19769e+08
Grid : Message : 49.870621 s : Average mflops/s per call per node : 1.67908e+09
Grid : Message : 49.870626 s : Average mflops/s per call (full) : 3.90723e+07
Grid : Message : 49.870627 s : Average mflops/s per call per rank (full): 2.44202e+06
Grid : Message : 49.870628 s : Average mflops/s per call per node (full): 9.76808e+06
Grid : Message : 49.870629 s : WilsonFermion5D Stencil
Grid : Message : 49.870630 s : WilsonFermion5D StencilEven
Grid : Message : 49.870631 s : WilsonFermion5D StencilOdd
Grid : Message : 49.870632 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 49.870633 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 49.870634 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 77.321890 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 77.321911 s : Called DwDag
Grid : Message : 77.321912 s : norm dag result 12.0421
Grid : Message : 77.334619 s : norm dag ref 12.0421
Grid : Message : 77.350515 s : norm dag diff 7.63236e-14
Grid : Message : 77.389923 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 77.769815 s : src_e0.499997
Grid : Message : 77.847560 s : src_o0.500003
Grid : Message : 77.917493 s : *********************************************************
Grid : Message : 77.917496 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 77.917497 s : * Vectorising space-time by 8
Grid : Message : 77.917498 s : * SINGLE precision
Grid : Message : 77.917499 s : * Using Overlapped Comms/Compute
Grid : Message : 77.917500 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 77.917501 s : *********************************************************
Grid : Message : 91.412946 s : Deo mflop/s = 3.95925e+07
Grid : Message : 91.412978 s : Deo mflop/s per rank 2.47453e+06
Grid : Message : 91.412980 s : Deo mflop/s per node 9.89813e+06
Grid : Message : 91.412983 s : #### Dhop calls report
Grid : Message : 91.412984 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 91.412986 s : WilsonFermion5D TotalTime /Calls : 4496.84 us
Grid : Message : 91.412988 s : WilsonFermion5D CommTime /Calls : 3057.28 us
Grid : Message : 91.412989 s : WilsonFermion5D FaceTime /Calls : 528.499 us
Grid : Message : 91.412990 s : WilsonFermion5D ComputeTime1/Calls : 16.1939 us
Grid : Message : 91.412991 s : WilsonFermion5D ComputeTime2/Calls : 942.557 us
Grid : Message : 91.413021 s : Average mflops/s per call : 1.12574e+10
Grid : Message : 91.413023 s : Average mflops/s per call per rank : 7.03586e+08
Grid : Message : 91.413024 s : Average mflops/s per call per node : 2.81434e+09
Grid : Message : 91.413025 s : Average mflops/s per call (full) : 4.01145e+07
Grid : Message : 91.413026 s : Average mflops/s per call per rank (full): 2.50716e+06
Grid : Message : 91.413027 s : Average mflops/s per call per node (full): 1.00286e+07
Grid : Message : 91.413028 s : WilsonFermion5D Stencil
Grid : Message : 91.413029 s : WilsonFermion5D StencilEven
Grid : Message : 91.413030 s : WilsonFermion5D StencilOdd
Grid : Message : 91.413031 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 91.413032 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 91.413033 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 91.470394 s : r_e6.02111
Grid : Message : 91.476539 s : r_o6.02102
Grid : Message : 91.482442 s : res12.0421
Grid : Message : 92.138799 s : norm diff 0
Grid : Message : 92.545354 s : norm diff even 0
Grid : Message : 92.619444 s : norm diff odd 0
systems/Booster/dwf16.slurm (new file, 29 lines)
@@ -0,0 +1,29 @@
#!/bin/sh
#SBATCH --account=gm2dwf
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=0:30:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4

export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap --comms-concurrent"

srun -N 16 -n $SLURM_NTASKS \
./benchmarks/Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 > dwf.16node.perf
systems/Booster/dwf4.slurm (new file, 39 lines)
@@ -0,0 +1,39 @@
#!/bin/sh
#SBATCH --account=gm2dwf
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=2:00:00
#SBATCH --partition=develbooster
#SBATCH --gres=gpu:4

export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

OPT="--comms-overlap --comms-concurrent"

srun -N 4 -n $SLURM_NTASKS \
./benchmarks/Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf

srun -N 4 -n $SLURM_NTASKS \
./benchmarks/Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > comms.4node.perf
systems/Booster/sourceme.sh (new file, 5 lines)
@@ -0,0 +1,5 @@
module load GCC/9.3.0
module load GMP/6.2.0
module load MPFR/4.1.0
module load OpenMPI/4.1.0rc1
module load CUDA/11.3