From 94019a922e933749b82e16bf24fe89fef50dc95b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 30 Jan 2025 16:36:46 +0000 Subject: [PATCH] Significantly better performance on Aurora without using pipeline mode --- Grid/communicator/Communicator_base.h | 6 + Grid/communicator/Communicator_mpi3.cc | 282 +++++++++++++----- Grid/communicator/Communicator_none.cc | 9 + Grid/communicator/SharedMemoryMPI.cc | 19 +- .../WilsonFermion5DImplementation.h | 28 +- Grid/stencil/Stencil.h | 8 + Grid/threads/Accelerator.h | 13 +- Makefile.am | 2 +- configure.ac | 5 +- systems/Aurora/benchmarks/bench2.pbs | 16 +- systems/Aurora/benchmarks/gpu_tile.sh | 14 +- systems/Aurora/config-command | 5 +- 12 files changed, 306 insertions(+), 101 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 0da7dc22..85659b3d 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -186,6 +186,12 @@ public: int recv_from_rank,int do_recv, int bytes,int dir); + double StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int xmit_to_rank,int do_xmit, + void *recv, + int recv_from_rank,int do_recv, + int xbytes,int rbytes,int dir); double StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank,int do_xmit, diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index d269f933..6b6c9dec 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -391,42 +391,131 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int bytes,int dir) { std::vector list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); + double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); + offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } -#undef NVLINK_GET // Define to use get instead of put DMA -double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + +#ifdef ACCELERATOR_AWARE_MPI +double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) +{ + return 0.0; // Do nothing -- no preparation required +} +double CartesianCommunicator::StencilSendToRecvFromBegin(int list_idx, + std::vector &list, void *xmit, int dest,int dox, void *recv, int from,int dor, int xbytes,int rbytes,int dir) +{ + int ncomm =communicator_halo.size(); + int commdir=dir%ncomm; + + MPI_Request xrq; + MPI_Request rrq; + + int ierr; + int gdest = ShmRanks[dest]; + int gfrom = ShmRanks[from]; + int gme = ShmRanks[_processor]; + + assert(dest != _processor); + assert(from != _processor); + assert(gme == ShmRank); + double off_node_bytes=0.0; + int tag; + + if ( dor ) { + if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+from*32; + ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); + assert(ierr==0); + list.push_back(rrq); + off_node_bytes+=rbytes; + } + } + + if (dox) { + if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+_processor*32; + ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + assert(ierr==0); + list.push_back(xrq); + off_node_bytes+=xbytes; + } else { + void *shm = (void *) this->ShmBufferTranslate(dest,recv); + assert(shm!=NULL); + acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); + } + } + return off_node_bytes; +} + +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) +{ + int nreq=list.size(); + + acceleratorCopySynchronise(); + + if (nreq==0) return; + std::vector status(nreq); + int ierr = MPI_Waitall(nreq,&list[0],&status[0]); + assert(ierr==0); + list.resize(0); +} + +#else /* NOT ... ACCELERATOR_AWARE_MPI */ +/////////////////////////////////////////// +// Pipeline mode through host memory +/////////////////////////////////////////// + /* + * In prepare (phase 1): + * PHASE 1: (prepare) + * - post MPI receive buffers asynch + * - post device - host send buffer transfer asynch + * - post device - device transfers + * PHASE 2: (Begin) + * - complete all copies + * - post MPI send asynch + * PHASE 3: (Complete) + * - MPI_waitall + * - host-device transfers + * + ********************************* + * NB could split this further: + *-------------------------------- + * PHASE 1: (Prepare) + * - post MPI receive buffers asynch + * - post device - host send buffer transfer asynch + * PHASE 2: (BeginInterNode) + * - complete all copies + * - post MPI send asynch + * PHASE 3: (BeginIntraNode) + * - post device - device transfers + * PHASE 4: (Complete) + * - MPI_waitall + * - host-device transfers asynch + * - (complete all copies) + */ +double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) { /* * Bring sequence from Stencil.h down to lower level. * Assume using XeLink is ok -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - // Introduce a host buffer with a cheap slab allocator and zero cost wipe all - Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes); - Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); - if ( Packets[i].do_send ) { - acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); - } - _grid->StencilSendToRecvFromBegin(MpiReqs, - Packets[i].host_send_buf, - Packets[i].to_rank,Packets[i].do_send, - Packets[i].host_recv_buf, - Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes,Packets[i].rbytes,i); - } - for(int i=0;iHostBufferFreeAll(); -*/ + */ int ncomm =communicator_halo.size(); int commdir=dir%ncomm; @@ -447,14 +536,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorHostBufferMalloc(rbytes); ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); assert(ierr==0); @@ -465,79 +555,137 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(from,xmit); - assert(shm!=NULL); - acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); -#endif } } if (dox) { - // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { tag= dir+_processor*32; -#ifdef ACCELERATOR_AWARE_MPI - ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); - assert(ierr==0); - list.push_back(xrq); -#else - std::cout << " send via host bounce "<HostBufferMalloc(xbytes); - acceleratorCopyFromDevice(xmit, host_xmit,xbytes); - ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); - assert(ierr==0); + acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch + + // ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + // assert(ierr==0); + // off_node_bytes+=xbytes; + CommsRequest_t srq; srq.PacketType = InterNodeXmit; srq.bytes = xbytes; - srq.req = xrq; + // srq.req = xrq; srq.host_buf = host_xmit; srq.device_buf = xmit; list.push_back(srq); -#endif - off_node_bytes+=xbytes; + } else { -#ifndef NVLINK_GET void *shm = (void *) this->ShmBufferTranslate(dest,recv); assert(shm!=NULL); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); -#endif - } } return off_node_bytes; } + +double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) +{ + int ncomm =communicator_halo.size(); + int commdir=dir%ncomm; + + MPI_Request xrq; + MPI_Request rrq; + + int ierr; + int gdest = ShmRanks[dest]; + int gfrom = ShmRanks[from]; + int gme = ShmRanks[_processor]; + + assert(dest != _processor); + assert(from != _processor); + assert(gme == ShmRank); + double off_node_bytes=0.0; + int tag; + + void * host_xmit = NULL; + + //////////////////////////////// + // Receives already posted + // Copies already started + //////////////////////////////// + /* + * PHASE 2: (Begin) + * - complete all copies + * - post MPI send asynch + */ + acceleratorCopySynchronise(); + + static int printed; + if(!printed && this->IsBoss() ) { + printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes); + printed=1; + } + + if (dox) { + + if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+_processor*32; + // Find the send in the prepared list + int list_idx=-1; + for(int idx = 0; idx &list,int dir) { int nreq=list.size(); - acceleratorCopySynchronise(); - if (nreq==0) return; -#ifdef ACCELERATOR_AWARE_MPI std::vector status(nreq); - int ierr = MPI_Waitall(nreq,&list[0],&status[0]); - assert(ierr==0); - list.resize(0); -#else - // Wait individually and immediately copy receives to device - // Promition to Asynch copy and single wait is easy - MPI_Status status; + std::vector MpiRequests(nreq); + + for(int r=0;rHostBufferFreeAll(); -#endif + + acceleratorCopySynchronise(); // Complete all pending copy transfers + list.resize(0); // Delete the list + this->HostBufferFreeAll(); // Clean up the buffer allocs } +#endif +//////////////////////////////////////////// +// END PIPELINE MODE / NO CUDA AWARE MPI +//////////////////////////////////////////// + void CartesianCommunicator::StencilBarrier(void) { MPI_Barrier (ShmComm); diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 7e7dfac8..8e6206ef 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -132,6 +132,15 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, { return 2.0*bytes; } +double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int xmit_to_rank,int dox, + void *recv, + int recv_from_rank,int dor, + int xbytes,int rbytes, int dir) +{ + return xbytes+rbytes; +} double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank,int dox, diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index c7668f8b..ce11714f 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -43,8 +43,8 @@ Author: Christoph Lehner #define GRID_SYCL_LEVEL_ZERO_IPC #define SHM_SOCKETS #else -#undef NUMA_PLACE_HOSTBUF -#ifdef NUMA_PLACE_HOSTBUF +#ifdef HAVE_NUMAIF_H + #warning " Using NUMAIF " #include #endif #endif @@ -544,18 +544,19 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) #ifndef ACCELERATOR_AWARE_MPI printf("Host buffer allocate for GPU non-aware MPI\n"); HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host -#ifdef NUMA_PLACE_HOSTBUF +#ifdef HAVE_NUMAIF_H + #warning "Moving host buffers to specific NUMA domain" int numa; char *numa_name=(char *)getenv("MPI_BUF_NUMA"); if(numa_name) { - page_size = sysconf(_SC_PAGESIZE); + unsigned long page_size = sysconf(_SC_PAGESIZE); numa = atoi(numa_name); unsigned long page_count = bytes/page_size; - std::vector pages(pcount); - std::vector nodes(pcount,numa); - std::vector status(pcount,-1); + std::vector pages(page_count); + std::vector nodes(page_count,numa); + std::vector status(page_count,-1); for(unsigned long p=0;p::DhopInternalOverlappedComms(StencilImpl & st, // std::cout << " WilsonFermion5D Communicate Begin " < > requests; - auto id=traceStart("Communicate overlapped"); - st.CommunicateBegin(requests); - +#ifndef GRID_ACCELERATED ///////////////////////////// // Overlap with comms ///////////////////////////// - { - // std::cout << " WilsonFermion5D Comms merge " <::DhopInternalOverlappedComms(StencilImpl & st, GRID_TRACE("DhopInterior"); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); } - + +#ifdef GRID_ACCELERATED + ///////////////////////////// + // Overlap with comms -- on GPU the interior kernel call is nonblocking + ///////////////////////////// + st.CommunicateBegin(requests); + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms +#endif + + ///////////////////////////// // Complete comms ///////////////////////////// // std::cout << " WilsonFermion5D Comms Complete " <StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. + for(int i=0;iStencilSendToRecvFromPrepare(MpiReqs, + Packets[i].send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index dc68fd2d..2862d087 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -327,7 +327,10 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } + inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} +inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); } +inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); } inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} @@ -465,8 +468,7 @@ inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} -//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} -//inline void acceleratorCopySynchronise(void) { } + inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch @@ -483,6 +485,13 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize #endif +inline void acceleratorPin(void *ptr,unsigned long bytes) +{ +#ifdef GRID_SYCL + sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context()); +#endif +} + ////////////////////////////////////////////// // Common on all GPU targets ////////////////////////////////////////////// diff --git a/Makefile.am b/Makefile.am index d2a1a326..9addcbf5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,5 +1,5 @@ # additional include paths necessary to compile the C++ library -SUBDIRS = Grid HMC benchmarks tests examples +SUBDIRS = Grid benchmarks tests examples HMC include $(top_srcdir)/doxygen.inc diff --git a/configure.ac b/configure.ac index 0b71b834..e4b553bf 100644 --- a/configure.ac +++ b/configure.ac @@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h) AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(endian.h) AC_CHECK_HEADERS(execinfo.h) +AC_CHECK_HEADERS(numaif.h) AC_CHECK_DECLS([ntohll],[], [], [[#include ]]) AC_CHECK_DECLS([be64toh],[], [], [[#include ]]) @@ -245,9 +246,11 @@ AC_ARG_ENABLE([accelerator-aware-mpi], [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) +# Force accelerator CSHIFT now +AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device]) + case ${ac_ACCELERATOR_AWARE_MPI} in yes) - AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device]) AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; *);; esac diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs index 81b3128d..aebed04e 100644 --- a/systems/Aurora/benchmarks/bench2.pbs +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -27,10 +27,22 @@ export MPICH_OFI_NIC_POLICY=GPU #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +# +# Local vol 16.16.16.32 +# + +#VOL=32.64.64.96 + +for VOL in 32.32.32.96 32.64.64.96 +do +for AT in 32 +do CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 " + ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap " echo $CMD $CMD +done +done diff --git a/systems/Aurora/benchmarks/gpu_tile.sh b/systems/Aurora/benchmarks/gpu_tile.sh index ddb25c5e..ef64299c 100755 --- a/systems/Aurora/benchmarks/gpu_tile.sh +++ b/systems/Aurora/benchmarks/gpu_tile.sh @@ -5,11 +5,11 @@ #export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 ); -export NUMA_MMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 ); +export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 ); export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]} -export NUMAM=${NUMA_PMAP[$PALS_LOCAL_RANKID]} +export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]} export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} unset EnableWalkerPartition @@ -19,17 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:7 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 +export MPI_BUF_NUMA=$NUMAH + echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " if [ $PALS_RANKID = "0" ] then - numactl -m $NUMAM -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" -# numactl -m $NUMAM -N $NUMAP "$@" + numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" +# numactl -p $NUMAP -N $NUMAP "$@" else - numactl -m $NUMAM -N $NUMAP "$@" + numactl -p $NUMAP -N $NUMAP "$@" fi diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 64bef44b..6e5512ff 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,6 +1,7 @@ #Ahead of time compile for PVC -export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " -export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " + +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/" #JIT compile #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "