1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-03 18:55:56 +01:00

Best results on Aurora so far

This commit is contained in:
Peter Boyle 2025-01-31 16:14:45 +00:00
parent 94019a922e
commit 8cf809e231
7 changed files with 82 additions and 20 deletions

View File

@ -408,8 +408,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(int list_idx,
std::vector<CommsRequest_t> &list,
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
@ -470,6 +469,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
@ -481,10 +481,10 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* - post device - device transfers
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
@ -561,6 +561,8 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
@ -577,11 +579,30 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#else
tag= dir+_processor*32;
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
host_xmit = this->HostBufferMalloc(xbytes);
const int chunks=1;
for(int n=0;n<chunks;n++){
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#endif
}
}
@ -623,17 +644,17 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
* - complete all copies
* - post MPI send asynch
*/
acceleratorCopySynchronise();
static int printed;
if(!printed && this->IsBoss() ) {
printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
printed=1;
}
// static int printed;
// if((printed<8) && this->IsBoss() ) {
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
// printed++;
// }
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
@ -652,7 +673,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
}
#endif
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
}
}
return off_node_bytes;
}
@ -680,6 +706,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
this->StencilBarrier();
}
#endif
////////////////////////////////////////////

View File

@ -543,6 +543,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#warning "Moving host buffers to specific NUMA domain"
@ -569,6 +572,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {

View File

@ -332,7 +332,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
std::vector<std::vector<CommsRequest_t> > requests;
#ifndef GRID_ACCELERATED
#if 1
/////////////////////////////
// Overlap with comms
/////////////////////////////
@ -352,7 +353,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
}
#ifdef GRID_ACCELERATED
//ifdef GRID_ACCELERATED
#if 0
/////////////////////////////
// Overlap with comms -- on GPU the interior kernel call is nonblocking
/////////////////////////////

View File

@ -376,6 +376,7 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
acceleratorCopySynchronise();
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
@ -401,7 +402,6 @@ public:
else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier();
_grid->StencilBarrier();
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);

View File

@ -209,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
} \
}
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
}
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
@ -230,8 +241,10 @@ inline void *acceleratorAllocDevice(size_t bytes)
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
@ -322,7 +335,9 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
@ -441,6 +456,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
} \
}
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocHost((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
}
return ptr;
};
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
@ -464,6 +489,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
@ -546,8 +572,10 @@ inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
#else

View File

@ -52,7 +52,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
int Ls=16;
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;

View File

@ -19,12 +19,12 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:7
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
export MPI_BUF_NUMA=$NUMAH
#export MPI_BUF_NUMA=$NUMAH
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "