Best results on Aurora so far
commit 8cf809e231
parent 94019a922e
@@ -408,8 +408,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 {
   return 0.0; // Do nothing -- no preparation required
 }
-double CartesianCommunicator::StencilSendToRecvFromBegin(int list_idx,
-                                                         std::vector<CommsRequest_t> &list,
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int dest,int dox,
                                                          void *recv,
@@ -470,6 +469,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
   assert(ierr==0);
   list.resize(0);
+  this->StencilBarrier();
 }
 
 #else /* NOT ... ACCELERATOR_AWARE_MPI */
@@ -481,10 +481,10 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
  * PHASE 1: (prepare)
  * - post MPI receive buffers asynch
  * - post device - host send buffer transfer asynch
- * - post device - device transfers
  * PHASE 2: (Begin)
  * - complete all copies
  * - post MPI send asynch
+ * - post device - device transfers
  * PHASE 3: (Complete)
  * - MPI_waitall
  * - host-device transfers
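Note: the reordered comment tracks a real change: device-to-device (intranode) transfers now happen in Begin rather than Prepare. A minimal sketch of the three phases for one neighbour, using the helper names visible in this diff; buffer management and CommsRequest_t bookkeeping are elided, so this is an illustration rather than the committed code:

    // Sketch only (not committed code): staged halo exchange for one neighbour
    // on the non-accelerator-aware-MPI path.
    double StagedExchange(void *xmit, void *recv, void *host_xmit, void *host_recv,
                          size_t xbytes, size_t rbytes,
                          int dest, int from, int tag, MPI_Comm comm)
    {
      MPI_Request reqs[2];
      // PHASE 1 (prepare): post the receive; start the device->host staging copy
      MPI_Irecv (host_recv, rbytes, MPI_CHAR, from, tag, comm, &reqs[0]);
      acceleratorCopyFromDeviceAsynch(xmit, host_xmit, xbytes);
      // PHASE 2 (begin): complete all copies, then post the MPI send;
      // an intranode neighbour would instead get one device->device copy here
      acceleratorCopySynchronise();
      MPI_Isend (host_xmit, xbytes, MPI_CHAR, dest, tag, comm, &reqs[1]);
      // PHASE 3 (complete): wait on MPI, then unstage host->device
      MPI_Status status[2];
      MPI_Waitall(2, reqs, status);
      acceleratorCopyToDevice(host_recv, recv, rbytes);
      return (double)(xbytes + rbytes); // bytes moved off node
    }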
@@ -561,6 +561,8 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+#undef DEVICE_TO_HOST_CONCURRENT // pipeline
+#ifdef DEVICE_TO_HOST_CONCURRENT
       tag= dir+_processor*32;
 
       host_xmit = this->HostBufferMalloc(xbytes);
@@ -577,11 +579,30 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       srq.host_buf   = host_xmit;
       srq.device_buf = xmit;
       list.push_back(srq);
+#else
+      tag= dir+_processor*32;
 
-    } else {
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+      host_xmit = this->HostBufferMalloc(xbytes);
+      const int chunks=1;
+      for(int n=0;n<chunks;n++){
+        void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
+        void * xmitc      = (void *)( (uint64_t) xmit      + n*xbytes/chunks);
+        acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
+      }
+      acceleratorCopySynchronise(); // Complete all pending copy transfers
+
+      ierr = MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      off_node_bytes+=xbytes;
+
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      list.push_back(srq);
+#endif
     }
   }
 
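Note: with chunks=1 the new #else branch degenerates to a single staging copy, but the loop is the hook for a pipeline in which the MPI send of chunk n can eventually overlap the copy of chunk n+1. A standalone restatement of the chunking arithmetic (hypothetical helper name; assumes chunks divides xbytes):

    // Hypothetical helper isolating the chunked device->host staging above.
    void ChunkedCopyFromDevice(void *xmit, void *host_xmit, size_t xbytes, int chunks)
    {
      for(int n=0;n<chunks;n++){
        void *host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks );
        void *xmitc      = (void *)( (uint64_t) xmit      + n*xbytes/chunks );
        // each chunk is an independent asynchronous transfer
        acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc, xbytes/chunks);
      }
      acceleratorCopySynchronise(); // all chunks resident on the host after this
    }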
@@ -623,17 +644,17 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  * - complete all copies
  * - post MPI send asynch
  */
-  acceleratorCopySynchronise();
 
-  static int printed;
-  if(!printed && this->IsBoss() ) {
-    printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
-    printed=1;
-  }
+  // static int printed;
+  // if((printed<8) && this->IsBoss() ) {
+  //   printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
+  //   printed++;
+  // }
 
   if (dox) {
 
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+#ifdef DEVICE_TO_HOST_CONCURRENT
       tag= dir+_processor*32;
       // Find the send in the prepared list
       int list_idx=-1;
@@ -652,7 +673,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       assert(ierr==0);
       list[list_idx].req = xrq; // Update the MPI request in the list
       off_node_bytes+=xbytes;
-    }
+#endif
+    } else {
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+    }
   }
   return off_node_bytes;
 }
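Note: the else branch restored here is the intranode fast path. When the destination rank shares the node (gdest != MPI_UNDEFINED and Stencil_force_mpi off), ShmBufferTranslate maps the peer's receive buffer into this rank's address space and the payload moves with one asynchronous device-to-device copy, bypassing MPI and the host staging buffers. Relocating it from Prepare into Begin matches the reordered PHASE comments above.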
@@ -680,6 +706,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
   acceleratorCopySynchronise(); // Complete all pending copy transfers
   list.resize(0);               // Delete the list
   this->HostBufferFreeAll();    // Clean up the buffer allocs
+  this->StencilBarrier();
 }
 #endif
 ////////////////////////////////////////////
@@ -543,6 +543,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
   printf("Host buffer allocate for GPU non-aware MPI\n");
+#if 0
+  HostCommBuf= acceleratorAllocHost(bytes);
+#else
   HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
 #ifdef HAVE_NUMAIF_H
 #warning "Moving host buffers to specific NUMA domain"
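Note: the staged #if 0 branch would swap the plain malloc for the new acceleratorAllocHost added to each backend later in this commit, yielding a pinned, runtime-registered staging buffer in one call; the live path keeps malloc followed by the explicit NUMA placement and acceleratorPin below.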
@@ -569,6 +572,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   }
 #endif
   acceleratorPin(HostCommBuf,bytes);
+#endif
+
 #endif
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
@@ -332,7 +332,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 
   // std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
   std::vector<std::vector<CommsRequest_t> > requests;
-#ifndef GRID_ACCELERATED
+
+#if 1
   /////////////////////////////
   // Overlap with comms
   /////////////////////////////
@@ -352,7 +353,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
     Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
   }
 
-#ifdef GRID_ACCELERATED
+  //ifdef GRID_ACCELERATED
+#if 0
   /////////////////////////////
   // Overlap with comms -- on GPU the interior kernel call is nonblocking
   /////////////////////////////
|
@ -376,6 +376,7 @@ public:
|
|||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||||
}
|
}
|
||||||
|
acceleratorCopySynchronise();
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,
|
||||||
@@ -401,7 +402,6 @@ public:
     else DslashLogFull();
     // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
     // accelerator_barrier();
-    _grid->StencilBarrier();
     for(int i=0;i<Packets.size();i++){
       if ( Packets[i].do_recv )
         FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
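Note: the StencilBarrier removed here is not lost; the communicator hunks above add this->StencilBarrier() at the end of both StencilSendToRecvFromComplete implementations, so the synchronisation now lives with MPI completion rather than in the stencil driver.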
@@ -209,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
     } \
   }
 
+inline void *acceleratorAllocHost(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMallocHost((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = (void *) NULL;
+    printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
+    assert(0);
+  }
+  return ptr;
+}
 inline void *acceleratorAllocShared(size_t bytes)
 {
   void *ptr=NULL;
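Note: the new CUDA acceleratorAllocHost mirrors the SYCL and HIP additions below: page-locked host memory that asynchronous copies can genuinely overlap. A usage sketch, not from the commit (dev_buf and bytes are hypothetical; the committed code recycles these buffers through HostBufferMalloc/HostBufferFreeAll rather than allocating per call):

    void StageOut(void *dev_buf, size_t bytes)
    {
      void *host_buf = acceleratorAllocHost(bytes);              // pinned host memory
      acceleratorCopyFromDeviceAsynch(dev_buf, host_buf, bytes); // overlappable D2H copy
      acceleratorCopySynchronise();                              // host_buf now safe to hand to MPI_Isend
    }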
@@ -230,8 +241,10 @@ inline void *acceleratorAllocDevice(size_t bytes)
   }
   return ptr;
 };
+
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
+inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
@@ -322,7 +335,9 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
 
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
+inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
+inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 
@@ -441,6 +456,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
     } \
   }
 
+inline void *acceleratorAllocHost(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipMallocHost((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
+  }
+  return ptr;
+};
 inline void *acceleratorAllocShared(size_t bytes)
 {
   void *ptr=NULL;
@@ -464,6 +489,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
   return ptr;
 };
 
+inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
@@ -546,8 +572,10 @@ inline void acceleratorCopySynchronise(void) {};
 inline int acceleratorIsCommunicable(void *ptr){ return 1; }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
 #ifdef HAVE_MM_MALLOC_H
+inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
+inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
 #else
@@ -52,7 +52,7 @@ int main (int argc, char ** argv)
 
   int threads = GridThread::GetThreads();
 
-  int Ls=16;
+  int Ls=8;
   for(int i=0;i<argc;i++) {
     if(std::string(argv[i]) == "-Ls"){
       std::stringstream ss(argv[i+1]); ss >> Ls;
|
@ -19,12 +19,12 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
|||||||
|
|
||||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:7
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||||
|
|
||||||
export MPI_BUF_NUMA=$NUMAH
|
#export MPI_BUF_NUMA=$NUMAH
|
||||||
|
|
||||||
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
|
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
|
||||||
|
|
||||||
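Note: SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE takes a lower:upper range of Level Zero copy-engine indices, so the change from 0:7 to 0:3 restricts the runtime to the first four engines, presumably the set that benchmarked best here; MPI_BUF_NUMA is commented out, reverting the MPI buffer NUMA placement to its default.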