1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 13:57:07 +01:00

Compare commits

...

3 Commits

16 changed files with 433 additions and 132 deletions

View File

@ -136,7 +136,7 @@ public:
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]); column.resize(_processors[d]);
column[0] = accum; column[0] = accum;
std::vector<CommsRequest_t> list; std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){ for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest); ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list, SendToRecvFromBegin(list,
@ -166,8 +166,8 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way // Face exchange, buffer swap in translational invariant way
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void CommsComplete(std::vector<CommsRequest_t> &list); void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
@ -186,6 +186,12 @@ public:
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
int bytes,int dir); int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,

View File

@ -317,7 +317,7 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
@ -342,7 +342,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list) void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{ {
int nreq=list.size(); int nreq=list.size();
@ -361,7 +361,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from, int from,
int bytes) int bytes)
{ {
std::vector<CommsRequest_t> reqs(0); std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0); unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0); unsigned long rcrc = crc32(0L, Z_NULL, 0);
@ -391,12 +391,23 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int bytes,int dir) int bytes,int dir)
{ {
std::vector<CommsRequest_t> list; std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir); StencilSendToRecvFromComplete(list,dir);
return offbytes; return offbytes;
} }
#undef NVLINK_GET // Define to use get instead of put DMA
#ifdef ACCELERATOR_AWARE_MPI
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest,int dox, int dest,int dox,
@ -429,15 +440,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=rbytes; off_node_bytes+=rbytes;
} }
#ifdef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
#endif
} }
if (dox) { if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32; tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@ -445,17 +450,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=xbytes; off_node_bytes+=xbytes;
} else { } else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv); void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
} }
} }
return off_node_bytes; return off_node_bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
int nreq=list.size(); int nreq=list.size();
@ -463,12 +465,254 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
acceleratorCopySynchronise(); acceleratorCopySynchronise();
if (nreq==0) return; if (nreq==0) return;
std::vector<MPI_Status> status(nreq); std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0); assert(ierr==0);
list.resize(0); list.resize(0);
this->StencilBarrier();
} }
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#else
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
const int chunks=1;
for(int n=0;n<chunks;n++){
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#endif
}
}
return off_node_bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
// static int printed;
// if((printed<8) && this->IsBoss() ) {
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
// printed++;
// }
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
for(int idx = 0; idx<list.size();idx++){
if ( (list[idx].device_buf==xmit)
&&(list[idx].PacketType==InterNodeXmit)
&&(list[idx].bytes==xbytes) ) {
list_idx = idx;
host_xmit = list[idx].host_buf;
}
}
assert(list_idx != -1); // found it
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
#endif
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
std::vector<MPI_Request> MpiRequests(nreq);
for(int r=0;r<nreq;r++){
MpiRequests[r] = list[r].req;
}
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
assert(ierr==0);
for(int r=0;r<nreq;r++){
if ( list[r].PacketType==InterNodeRecv ) {
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
}
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
this->StencilBarrier();
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void) void CartesianCommunicator::StencilBarrier(void)
{ {
MPI_Barrier (ShmComm); MPI_Barrier (ShmComm);

View File

@ -132,6 +132,15 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{ {
return 2.0*bytes; return 2.0*bytes;
} }
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return xbytes+rbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,

View File

@ -46,8 +46,22 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
unsigned long bytes;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t; typedef int CommsRequest_t;
typedef int Grid_MPI_Comm; typedef int Grid_MPI_Comm;
#endif #endif

View File

@ -43,8 +43,8 @@ Author: Christoph Lehner <christoph@lhnr.de>
#define GRID_SYCL_LEVEL_ZERO_IPC #define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS #define SHM_SOCKETS
#else #else
#undef NUMA_PLACE_HOSTBUF #ifdef HAVE_NUMAIF_H
#ifdef NUMA_PLACE_HOSTBUF #warning " Using NUMAIF "
#include <numaif.h> #include <numaif.h>
#endif #endif
#endif #endif
@ -543,19 +543,23 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI #ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n"); printf("Host buffer allocate for GPU non-aware MPI\n");
HostCommBuf= malloc(bytes); #if 0
#ifdef NUMA_PLACE_HOSTBUF HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#warning "Moving host buffers to specific NUMA domain"
int numa; int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA"); char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) { if(numa_name) {
page_size = sysconf(_SC_PAGESIZE); unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name); numa = atoi(numa_name);
unsigned long page_count = bytes/page_size; unsigned long page_count = bytes/page_size;
std::vector<void *> pages(pcount); std::vector<void *> pages(page_count);
std::vector<int> nodes(pcount,numa); std::vector<int> nodes(page_count,numa);
std::vector<int> status(pcount,-1); std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){ for(unsigned long p=0;p<page_count;p++){
pages[p] = HostCommBuf + p*page_size; pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
} }
int ret = move_pages(0, int ret = move_pages(0,
page_count, page_count,
@ -565,7 +569,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
MPOL_MF_MOVE); MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret); printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:"); if (ret) perror(" move_pages failed for reason:");
}
#endif #endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif #endif
ShmCommBuf = acceleratorAllocDevice(bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {

View File

@ -467,8 +467,8 @@ public:
send_buf.resize(buffer_size*2*depth); send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth);
std::vector<CommsRequest_t> fwd_req; std::vector<MpiCommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req; std::vector<MpiCommsRequest_t> bwd_req;
int words = buffer_size; int words = buffer_size;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);

View File

@ -332,22 +332,18 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl; // std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
auto id=traceStart("Communicate overlapped");
st.CommunicateBegin(requests);
#if 1
///////////////////////////// /////////////////////////////
// Overlap with comms // Overlap with comms
///////////////////////////// /////////////////////////////
{ st.CommunicateBegin(requests);
// std::cout << " WilsonFermion5D Comms merge " <<std::endl;
GRID_TRACE("MergeSHM");
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
} #endif
///////////////////////////// /////////////////////////////
// do the compute interior // do the compute interior
///////////////////////////// /////////////////////////////
// std::cout << " WilsonFermion5D Interior " <<std::endl;
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior"); GRID_TRACE("DhopDagInterior");
@ -357,12 +353,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
} }
//ifdef GRID_ACCELERATED
#if 0
/////////////////////////////
// Overlap with comms -- on GPU the interior kernel call is nonblocking
/////////////////////////////
st.CommunicateBegin(requests);
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif
///////////////////////////// /////////////////////////////
// Complete comms // Complete comms
///////////////////////////// /////////////////////////////
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl; // std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
traceStop(id); // traceStop(id);
///////////////////////////// /////////////////////////////
// do the compute exterior // do the compute exterior

View File

@ -368,7 +368,15 @@ public:
// accelerator_barrier(); // All kernels should ALREADY be complete // accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too. // But the HaloGather had a barrier too.
#ifdef ACCELERATOR_AWARE_MPI for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
acceleratorCopySynchronise();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf, Packets[i].send_buf,
@ -377,23 +385,6 @@ public:
Packets[i].from_rank,Packets[i].do_recv, Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i); Packets[i].xbytes,Packets[i].rbytes,i);
} }
#else
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
if ( Packets[i].do_send ) {
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
}
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].host_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].host_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
#endif
// Get comms started then run checksums // Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!) // Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
@ -411,16 +402,6 @@ public:
else DslashLogFull(); else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier(); // accelerator_barrier();
#ifndef ACCELERATOR_AWARE_MPI
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) {
acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
}
}
_grid->HostBufferFreeAll();
#endif // run any checksums
_grid->StencilBarrier();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);

View File

@ -209,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
} \ } \
} }
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
}
inline void *acceleratorAllocShared(size_t bytes) inline void *acceleratorAllocShared(size_t bytes)
{ {
void *ptr=NULL; void *ptr=NULL;
@ -230,8 +241,10 @@ inline void *acceleratorAllocDevice(size_t bytes)
} }
return ptr; return ptr;
}; };
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
@ -322,12 +335,17 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
@ -438,6 +456,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
} \ } \
} }
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocHost((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
}
return ptr;
};
inline void *acceleratorAllocShared(size_t bytes) inline void *acceleratorAllocShared(size_t bytes)
{ {
void *ptr=NULL; void *ptr=NULL;
@ -461,12 +489,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr; return ptr;
}; };
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
//inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
@ -483,6 +511,13 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
#endif #endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
{
#ifdef GRID_SYCL
sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
#endif
}
////////////////////////////////////////////// //////////////////////////////////////////////
// Common on all GPU targets // Common on all GPU targets
////////////////////////////////////////////// //////////////////////////////////////////////
@ -537,8 +572,10 @@ inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);}; inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);}; inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
#else #else

View File

@ -1,5 +1,5 @@
# additional include paths necessary to compile the C++ library # additional include paths necessary to compile the C++ library
SUBDIRS = Grid HMC benchmarks tests examples SUBDIRS = Grid benchmarks tests examples HMC
include $(top_srcdir)/doxygen.inc include $(top_srcdir)/doxygen.inc

View File

@ -52,7 +52,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
int Ls=16; int Ls=8;
for(int i=0;i<argc;i++) { for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){ if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls; std::stringstream ss(argv[i+1]); ss >> Ls;

View File

@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h) AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h) AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(numaif.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
@ -245,9 +246,11 @@ AC_ARG_ENABLE([accelerator-aware-mpi],
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
# Force accelerator CSHIFT now
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
case ${ac_ACCELERATOR_AWARE_MPI} in case ${ac_ACCELERATOR_AWARE_MPI} in
yes) yes)
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
*);; *);;
esac esac

View File

@ -29,7 +29,7 @@ export MPICH_OFI_NIC_POLICY=GPU
CMD="mpiexec -np 12 -ppn 12 -envall \ CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.96 \ ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 8 " --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 "
echo $CMD echo $CMD
$CMD $CMD

View File

@ -1,58 +1,48 @@
#!/bin/bash #!/bin/bash
#PBS -q EarlyAppAccess ##PBS -q EarlyAppAccess
#PBS -q debug
#PBS -l select=2 #PBS -l select=2
#PBS -l walltime=00:20:00 #PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA #PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR cd $PBS_O_WORKDIR
source ../sourceme.sh source ../sourceme.sh
#module load pti-gpu
cp $PBS_NODEFILE nodefile cp $PBS_NODEFILE nodefile
export OMP_NUM_THREADS=4 export OMP_NUM_THREADS=4
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export MPICH_OFI_NIC_POLICY=GPU
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 2 nodes, 24 ranks
# #
CMD="mpiexec -np 24 -ppn 12 -envall \ # Local vol 16.16.16.32
./gpu_tile.sh \ #
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
#$CMD | tee 2node.comms.hbm
#VOL=32.64.64.96
CMD="mpiexec -np 24 -ppn 12 -envall \ for VOL in 32.32.32.96 32.64.64.96
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals"
#for f in 1 2 3 4 5 6 7 8
for f in 1
do do
for AT in 32
do
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
echo $CMD echo $CMD
$CMD | tee 2node.32.32.64.48.dwf.hbm.$f $CMD
done
done done
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile.sh \
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 2node.64.64.64.96.dwf.hbm

View File

@ -5,11 +5,11 @@
#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) #export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 ); export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 );
export NUMA_MMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 ); export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]} export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
export NUMAM=${NUMA_PMAP[$PALS_LOCAL_RANKID]} export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
unset EnableWalkerPartition unset EnableWalkerPartition
@ -19,17 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
#export MPI_BUF_NUMA=$NUMAH
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
if [ $PALS_RANKID = "0" ] if [ $PALS_RANKID = "0" ]
then then
# numactl -m $NUMAM -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
numactl -m $NUMAM -N $NUMAP "$@" # numactl -p $NUMAP -N $NUMAP "$@"
else else
numactl -m $NUMAM -N $NUMAP "$@" numactl -p $NUMAP -N $NUMAP "$@"
fi fi

View File

@ -1,6 +1,7 @@
#Ahead of time compile for PVC #Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
#JIT compile #JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "