From 65920faebae56dd8e28d43442fe4010c2f491de0 Mon Sep 17 00:00:00 2001 From: Bartosz Kostrzewa Date: Mon, 31 Aug 2020 18:39:27 +0200 Subject: [PATCH 01/10] correct formatting of Benchmark_wilson_sweep output --- benchmarks/Benchmark_wilson_sweep.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc index f1473b44..100e09f5 100644 --- a/benchmarks/Benchmark_wilson_sweep.cc +++ b/benchmarks/Benchmark_wilson_sweep.cc @@ -89,6 +89,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage; std::cout << latt_size; + std::cout << "\t\t"; GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian RBGrid(&Grid); From a9b92867a817deed5c041a268d97c54d0674e56a Mon Sep 17 00:00:00 2001 From: Bartosz Kostrzewa Date: Mon, 31 Aug 2020 18:41:17 +0200 Subject: [PATCH 02/10] use tabulator --- benchmarks/Benchmark_wilson_sweep.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc index 100e09f5..986a1831 100644 --- a/benchmarks/Benchmark_wilson_sweep.cc +++ b/benchmarks/Benchmark_wilson_sweep.cc @@ -89,7 +89,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage; std::cout << latt_size; - std::cout << "\t\t"; + std::cout << "\t\t"; GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian RBGrid(&Grid); From 54523369a3096750dac18f2150a23d6d98709030 Mon Sep 17 00:00:00 2001 From: Bartosz Kostrzewa Date: Mon, 31 Aug 2020 19:39:36 +0200 Subject: [PATCH 03/10] do not use backspace in Coordinate output stream operator --- Grid/util/Coordinate.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/util/Coordinate.h b/Grid/util/Coordinate.h index 7f1d31c0..004fbc72 100644 --- a/Grid/util/Coordinate.h +++ b/Grid/util/Coordinate.h @@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector 0) { - os << "\b"; + os << v[s]; + if( s < (v.size()-1) ){ + os << " "; + } } os << "]"; return os; From d3ce60713d87dd1baf7d9f7acf1097da1c932d1a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 15:44:13 -0400 Subject: [PATCH 04/10] UVM, Device and Lattice/aligned allocators --- Grid/allocator/AlignedAllocator.h | 57 +++++++++++++++++++++++++++---- Grid/allocator/MemoryManager.h | 4 +-- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index ebb3162b..249732fb 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -65,8 +65,7 @@ public: MemoryManager::CpuFree((void *)__p,bytes); } - // FIXME: hack for the copy constructor, eventually it must be avoided - //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; + // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop void construct(pointer __p, const _Tp& __val) { assert(0);}; void construct(pointer __p) { }; void destroy(pointer __p) { }; @@ -74,6 +73,9 @@ public: template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } +////////////////////////////////////////////////////////////////////////////////////// +// Unified virtual memory +////////////////////////////////////////////////////////////////////////////////////// template class uvmAllocator { public: @@ -109,22 +111,63 @@ public: MemoryManager::SharedFree((void *)__p,bytes); } - // FIXME: hack for the copy constructor, eventually it must be avoided void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; - //void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p) { }; void destroy(pointer __p) { }; }; template inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; } template inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } +//////////////////////////////////////////////////////////////////////////////// +// Device memory +//////////////////////////////////////////////////////////////////////////////// +template +class devAllocator { +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template struct rebind { typedef devAllocator<_Tp1> other; }; + devAllocator() throw() { } + devAllocator(const devAllocator&) throw() { } + template devAllocator(const devAllocator<_Tp1>&) throw() { } + ~devAllocator() throw() { } + pointer address(reference __x) const { return &__x; } + size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } + + pointer allocate(size_type __n, const void* _p= 0) + { + size_type bytes = __n*sizeof(_Tp); + profilerAllocate(bytes); + _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); + assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); + return ptr; + } + + void deallocate(pointer __p, size_type __n) + { + size_type bytes = __n * sizeof(_Tp); + profilerFree(bytes); + MemoryManager::AcceleratorFree((void *)__p,bytes); + } + void construct(pointer __p, const _Tp& __val) { }; + void construct(pointer __p) { }; + void destroy(pointer __p) { }; +}; +template inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; } +template inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; } + //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -template using commAllocator = uvmAllocator; +//template using commAllocator = devAllocator; template using Vector = std::vector >; -template using commVector = std::vector >; -//template using Matrix = std::vector > >; +template using commVector = std::vector >; NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 23065c58..aac13aee 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -93,12 +93,12 @@ private: static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ; static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ; - static void *AcceleratorAllocate(size_t bytes); - static void AcceleratorFree (void *ptr,size_t bytes); static void PrintBytes(void); public: static void Init(void); static void InitMessage(void); + static void *AcceleratorAllocate(size_t bytes); + static void AcceleratorFree (void *ptr,size_t bytes); static void *SharedAllocate(size_t bytes); static void SharedFree (void *ptr,size_t bytes); static void *CpuAllocate(size_t bytes); From 0c3095e173bc2886b121edb3fafa78646948f5e8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 15:45:35 -0400 Subject: [PATCH 05/10] Comms buffers to device memory --- Grid/communicator/Communicator_mpi3.cc | 88 +++++++++----------------- 1 file changed, 31 insertions(+), 57 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 6130195d..28c7b8a4 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -302,60 +302,35 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, int bytes) { std::vector reqs(0); - // unsigned long xcrc = crc32(0L, Z_NULL, 0); - // unsigned long rcrc = crc32(0L, Z_NULL, 0); - // xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); - SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); - SendToRecvFromComplete(reqs); - // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); - // printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc); -} -void CartesianCommunicator::SendRecvPacket(void *xmit, - void *recv, - int sender, - int receiver, - int bytes) -{ - MPI_Status stat; - assert(sender != receiver); - int tag = sender; - if ( _processor == sender ) { - MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); - } - if ( _processor == receiver ) { - MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); - } -} -// Basic Halo comms primitive -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ + unsigned long xcrc = crc32(0L, Z_NULL, 0); + unsigned long rcrc = crc32(0L, Z_NULL, 0); + int myrank = _processor; int ierr; - if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { - MPI_Request xrq; - MPI_Request rrq; + // Enforce no UVM in comms, device or host OK + int uvm; + auto + cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit); + assert(cuerr == cudaSuccess ); + assert(uvm==0); - ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); - ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); + cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv); + assert(cuerr == cudaSuccess ); + assert(uvm==0); - assert(ierr==0); - list.push_back(xrq); - list.push_back(rrq); - } else { - // Give the CPU to MPI immediately; can use threads to overlap optionally - ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, - recv,bytes,MPI_CHAR,from, from, - communicator,MPI_STATUS_IGNORE); - assert(ierr==0); - } + // Give the CPU to MPI immediately; can use threads to overlap optionally + // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); + ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, + recv,bytes,MPI_CHAR,from, from, + communicator,MPI_STATUS_IGNORE); + assert(ierr==0); + + // xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); + // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); + // printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush } - +// Basic Halo comms primitive double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int dest, void *recv, @@ -411,15 +386,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) -{ - SendToRecvFromComplete(waitall); -} -void CartesianCommunicator::StencilBarrier(void) -{ - MPI_Barrier (ShmComm); -} -void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) { int nreq=list.size(); @@ -430,6 +397,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & assert(ierr==0); list.resize(0); } +void CartesianCommunicator::StencilBarrier(void) +{ + MPI_Barrier (ShmComm); +} +//void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) +//{ +//} void CartesianCommunicator::Barrier(void) { int ierr = MPI_Barrier(communicator); From b4255140d611f3cbb1ccf6148450f14f5e0539cb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 15:47:46 -0400 Subject: [PATCH 06/10] Stale data member eliminated --- Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h | 2 +- Grid/qcd/action/fermion/WilsonFermion5D.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 625eda63..ca660610 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -208,7 +208,7 @@ public: LebesgueOrder LebesgueEvenOdd; // Comms buffer - std::vector > comm_buf; + // std::vector > comm_buf; /////////////////////////////////////////////////////////////// // Conserved current utilities diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 804b1d10..80231bb4 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -215,7 +215,7 @@ public: LebesgueOrder LebesgueEvenOdd; // Comms buffer - std::vector > comm_buf; + // std::vector > comm_buf; }; From 85b1c5df39c33e3de9d3ab2917400b9c985428c8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 15:48:16 -0400 Subject: [PATCH 07/10] A never hit case that is not 100% confident is asserted for safety --- Grid/cshift/Cshift_common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index d296c024..40ab5032 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -222,6 +222,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA // Test_cshift_red_black code. // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"< Date: Thu, 3 Sep 2020 15:49:13 -0400 Subject: [PATCH 08/10] Include cuda.h --- Grid/threads/Accelerator.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 74a3ea22..29d12904 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -90,6 +90,7 @@ void acceleratorInit(void); ////////////////////////////////////////////// #ifdef GRID_CUDA +#include #ifdef __CUDA_ARCH__ #define GRID_SIMT From 8244caff2507eab0a99a59f275bd1745efa7336d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 18:52:55 -0400 Subject: [PATCH 09/10] Remove the asynchronous non-Stencil calls. --- benchmarks/Benchmark_comms.cc | 108 ++++------------------------------ 1 file changed, 10 insertions(+), 98 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 44ccbd19..232030c8 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -74,90 +74,6 @@ int main (int argc, char ** argv) std::vector t_time(Nloop); time_statistics timestat; - std::cout< > xbuf(8); - std::vector > rbuf(8); - - int ncomm; - int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); - for(int mu=0;mu<8;mu++){ - xbuf[mu].resize(lat*lat*lat*Ls); - rbuf[mu].resize(lat*lat*lat*Ls); - // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < requests; - - ncomm=0; - for(int mu=0;mu<4;mu++){ - - if (mpi_layout[mu]>1 ) { - - ncomm++; - int comm_proc=1; - int xmit_to_rank; - int recv_from_rank; - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); - - comm_proc = mpi_layout[mu]-1; - - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); - - } - } - Grid.SendToRecvFromComplete(requests); - Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // microseconds - } - - timestat.statistics(t_time); - - double dbytes = bytes*ppn; - double xbytes = dbytes*2.0*ncomm; - double rbytes = xbytes; - double bidibytes = xbytes+rbytes; - - std::cout< requests; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); - Grid.SendToRecvFromComplete(requests); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); } comm_proc = mpi_layout[mu]-1; { std::vector requests; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); - Grid.SendToRecvFromComplete(requests); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); } } } From a8309638d4be9faf49619a8220a9b06a4e16eb73 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 20:29:26 -0400 Subject: [PATCH 10/10] UVM check in MPI calls --- Grid/communicator/Communicator_mpi3.cc | 11 ++--------- Grid/threads/Accelerator.h | 22 ++++++++++++++++++++++ configure.ac | 5 ++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 28c7b8a4..83f71233 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -309,15 +309,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, int ierr; // Enforce no UVM in comms, device or host OK - int uvm; - auto - cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit); - assert(cuerr == cudaSuccess ); - assert(uvm==0); - - cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv); - assert(cuerr == cudaSuccess ); - assert(uvm==0); + assert(acceleratorIsCommunicable(xmit)); + assert(acceleratorIsCommunicable(recv)); // Give the CPU to MPI immediately; can use threads to overlap optionally // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 29d12904..1a3dfdc2 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid); // // Memory management: // +// int acceleratorIsCommunicable(void *pointer); // void *acceleratorAllocShared(size_t bytes); // void acceleratorFreeShared(void *ptr); // @@ -166,6 +167,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline int acceleratorIsCommunicable(void *ptr) +{ + int uvm; + auto + cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); + assert(cuerr == cudaSuccess ); + if(uvm) return 0; + else return 1; +} + #endif ////////////////////////////////////////////// @@ -220,6 +231,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline int acceleratorIsCommunicable(void *ptr) +{ +#if 0 + auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); + if ( uvm = cl::sycl::usm::alloc::shared ) return 1; + else return 0; +#endif + return 1; +} #endif @@ -299,6 +319,7 @@ inline void *acceleratorAllocShared(size_t bytes) return malloc(bytes); #endif }; +inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline void *acceleratorAllocDevice(size_t bytes) { @@ -353,6 +374,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} +inline int acceleratorIsCommunicable(void *ptr){ return 1; } #ifdef HAVE_MM_MALLOC_H inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; diff --git a/configure.ac b/configure.ac index f0bea6a4..5113757c 100644 --- a/configure.ac +++ b/configure.ac @@ -154,6 +154,7 @@ AC_ARG_ENABLE([accelerator], case ${ac_ACCELERATOR} in cuda) echo CUDA acceleration + LIBS="${LIBS} -lcuda" AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);; sycl) echo SYCL acceleration @@ -323,7 +324,6 @@ case ${CXXTEST} in # CXXLD="nvcc -v -link" CXX="${CXXBASE} -x cu " CXXLD="${CXXBASE} -link" -# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" if test $ac_openmp = yes; then CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" @@ -483,8 +483,7 @@ case ${ac_SHM} in LDFLAGS_CPY=$LDFLAGS CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS" LDFLAGS="$AM_LDFLAGS $LDFLAGS" - AC_SEARCH_LIBS([shm_unlink], [rt], [], - [AC_MSG_ERROR("no library found for shm_unlink")]) + AC_SEARCH_LIBS([shm_unlink], [rt], [],[AC_MSG_ERROR("no library found for shm_unlink")]) CXXFLAGS=$CXXFLAGS_CPY LDFLAGS=$LDFLAGS_CPY ;;