mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
This commit is contained in:
commit
6dbd117aa5
@ -65,8 +65,7 @@ public:
|
||||
MemoryManager::CpuFree((void *)__p,bytes);
|
||||
}
|
||||
|
||||
// FIXME: hack for the copy constructor, eventually it must be avoided
|
||||
//void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
@ -74,6 +73,9 @@ public:
|
||||
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
|
||||
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Unified virtual memory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
template<typename _Tp>
|
||||
class uvmAllocator {
|
||||
public:
|
||||
@ -109,22 +111,63 @@ public:
|
||||
MemoryManager::SharedFree((void *)__p,bytes);
|
||||
}
|
||||
|
||||
// FIXME: hack for the copy constructor, eventually it must be avoided
|
||||
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
||||
//void construct(pointer __p, const _Tp& __val) { };
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
|
||||
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Device memory
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<typename _Tp>
|
||||
class devAllocator {
|
||||
public:
|
||||
typedef std::size_t size_type;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef _Tp* pointer;
|
||||
typedef const _Tp* const_pointer;
|
||||
typedef _Tp& reference;
|
||||
typedef const _Tp& const_reference;
|
||||
typedef _Tp value_type;
|
||||
|
||||
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
|
||||
devAllocator() throw() { }
|
||||
devAllocator(const devAllocator&) throw() { }
|
||||
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
|
||||
~devAllocator() throw() { }
|
||||
pointer address(reference __x) const { return &__x; }
|
||||
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
|
||||
|
||||
pointer allocate(size_type __n, const void* _p= 0)
|
||||
{
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
profilerAllocate(bytes);
|
||||
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
|
||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void deallocate(pointer __p, size_type __n)
|
||||
{
|
||||
size_type bytes = __n * sizeof(_Tp);
|
||||
profilerFree(bytes);
|
||||
MemoryManager::AcceleratorFree((void *)__p,bytes);
|
||||
}
|
||||
void construct(pointer __p, const _Tp& __val) { };
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
|
||||
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Template typedefs
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T> using commAllocator = uvmAllocator<T>;
|
||||
//template<class T> using commAllocator = devAllocator<T>;
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
||||
template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
|
||||
//template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
|
||||
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@ -93,12 +93,12 @@ private:
|
||||
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
||||
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
||||
|
||||
static void *AcceleratorAllocate(size_t bytes);
|
||||
static void AcceleratorFree (void *ptr,size_t bytes);
|
||||
static void PrintBytes(void);
|
||||
public:
|
||||
static void Init(void);
|
||||
static void InitMessage(void);
|
||||
static void *AcceleratorAllocate(size_t bytes);
|
||||
static void AcceleratorFree (void *ptr,size_t bytes);
|
||||
static void *SharedAllocate(size_t bytes);
|
||||
static void SharedFree (void *ptr,size_t bytes);
|
||||
static void *CpuAllocate(size_t bytes);
|
||||
|
@ -302,60 +302,28 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||
SendToRecvFromComplete(reqs);
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
|
||||
}
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int sender,
|
||||
int receiver,
|
||||
int bytes)
|
||||
{
|
||||
MPI_Status stat;
|
||||
assert(sender != receiver);
|
||||
int tag = sender;
|
||||
if ( _processor == sender ) {
|
||||
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||
}
|
||||
if ( _processor == receiver ) {
|
||||
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||
}
|
||||
}
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
|
||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
// Enforce no UVM in comms, device or host OK
|
||||
assert(acceleratorIsCommunicable(xmit));
|
||||
assert(acceleratorIsCommunicable(recv));
|
||||
|
||||
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
|
||||
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||
recv,bytes,MPI_CHAR,from, from,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
list.push_back(rrq);
|
||||
} else {
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||
recv,bytes,MPI_CHAR,from, from,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
}
|
||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
@ -411,15 +379,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
|
||||
return off_node_bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
void CartesianCommunicator::StencilBarrier(void)
|
||||
{
|
||||
MPI_Barrier (ShmComm);
|
||||
}
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
|
||||
@ -430,6 +390,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
|
||||
assert(ierr==0);
|
||||
list.resize(0);
|
||||
}
|
||||
void CartesianCommunicator::StencilBarrier(void)
|
||||
{
|
||||
MPI_Barrier (ShmComm);
|
||||
}
|
||||
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
//{
|
||||
//}
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
|
@ -222,6 +222,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
// Test_cshift_red_black code.
|
||||
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
||||
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
||||
assert(0); // This will fail if hit on GPU
|
||||
autoView( rhs_v, rhs, CpuWrite);
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
|
@ -208,7 +208,7 @@ public:
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
// Comms buffer
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
|
@ -215,7 +215,7 @@ public:
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
// Comms buffer
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
|
||||
};
|
||||
|
@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
//
|
||||
// Memory management:
|
||||
//
|
||||
// int acceleratorIsCommunicable(void *pointer);
|
||||
// void *acceleratorAllocShared(size_t bytes);
|
||||
// void acceleratorFreeShared(void *ptr);
|
||||
//
|
||||
@ -90,6 +91,7 @@ void acceleratorInit(void);
|
||||
//////////////////////////////////////////////
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
#include <cuda.h>
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define GRID_SIMT
|
||||
@ -165,6 +167,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
{
|
||||
int uvm;
|
||||
auto
|
||||
cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
|
||||
assert(cuerr == cudaSuccess );
|
||||
if(uvm) return 0;
|
||||
else return 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -219,6 +231,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
{
|
||||
#if 0
|
||||
auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
|
||||
if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
|
||||
else return 0;
|
||||
#endif
|
||||
return 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -298,6 +319,7 @@ inline void *acceleratorAllocShared(size_t bytes)
|
||||
return malloc(bytes);
|
||||
#endif
|
||||
};
|
||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||
|
||||
inline void *acceleratorAllocDevice(size_t bytes)
|
||||
{
|
||||
@ -352,6 +374,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
|
||||
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
|
||||
|
@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector<T,_nd
|
||||
{
|
||||
os << "[";
|
||||
for(int s=0;s<v.size();s++) {
|
||||
os << v[s] << " ";
|
||||
}
|
||||
if (v.size() > 0) {
|
||||
os << "\b";
|
||||
os << v[s];
|
||||
if( s < (v.size()-1) ){
|
||||
os << " ";
|
||||
}
|
||||
}
|
||||
os << "]";
|
||||
return os;
|
||||
|
@ -74,90 +74,6 @@ int main (int argc, char ** argv)
|
||||
std::vector<double> t_time(Nloop);
|
||||
time_statistics timestat;
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
header();
|
||||
for(int lat=8;lat<=maxlat;lat+=4){
|
||||
for(int Ls=8;Ls<=8;Ls*=2){
|
||||
|
||||
Coordinate latt_size ({lat*mpi_layout[0],
|
||||
lat*mpi_layout[1],
|
||||
lat*mpi_layout[2],
|
||||
lat*mpi_layout[3]});
|
||||
|
||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||
RealD Nrank = Grid._Nprocessors;
|
||||
RealD Nnode = Grid.NodeCount();
|
||||
RealD ppn = Nrank/Nnode;
|
||||
|
||||
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
|
||||
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
|
||||
|
||||
int ncomm;
|
||||
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||
for(int mu=0;mu<8;mu++){
|
||||
xbuf[mu].resize(lat*lat*lat*Ls);
|
||||
rbuf[mu].resize(lat*lat*lat*Ls);
|
||||
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
|
||||
}
|
||||
|
||||
for(int i=0;i<Nloop;i++){
|
||||
double start=usecond();
|
||||
|
||||
std::vector<CommsRequest_t> requests;
|
||||
|
||||
ncomm=0;
|
||||
for(int mu=0;mu<4;mu++){
|
||||
|
||||
if (mpi_layout[mu]>1 ) {
|
||||
|
||||
ncomm++;
|
||||
int comm_proc=1;
|
||||
int xmit_to_rank;
|
||||
int recv_from_rank;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
Grid.SendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu][0],
|
||||
xmit_to_rank,
|
||||
(void *)&rbuf[mu][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
comm_proc = mpi_layout[mu]-1;
|
||||
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
Grid.SendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu+4][0],
|
||||
xmit_to_rank,
|
||||
(void *)&rbuf[mu+4][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
}
|
||||
}
|
||||
Grid.SendToRecvFromComplete(requests);
|
||||
Grid.Barrier();
|
||||
double stop=usecond();
|
||||
t_time[i] = stop-start; // microseconds
|
||||
}
|
||||
|
||||
timestat.statistics(t_time);
|
||||
|
||||
double dbytes = bytes*ppn;
|
||||
double xbytes = dbytes*2.0*ncomm;
|
||||
double rbytes = xbytes;
|
||||
double bidibytes = xbytes+rbytes;
|
||||
|
||||
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
|
||||
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
|
||||
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
|
||||
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
|
||||
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
||||
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
@ -206,26 +122,22 @@ int main (int argc, char ** argv)
|
||||
{
|
||||
std::vector<CommsRequest_t> requests;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
Grid.SendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu][0],
|
||||
xmit_to_rank,
|
||||
(void *)&rbuf[mu][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
Grid.SendToRecvFromComplete(requests);
|
||||
Grid.SendToRecvFrom((void *)&xbuf[mu][0],
|
||||
xmit_to_rank,
|
||||
(void *)&rbuf[mu][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
}
|
||||
|
||||
comm_proc = mpi_layout[mu]-1;
|
||||
{
|
||||
std::vector<CommsRequest_t> requests;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
Grid.SendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu+4][0],
|
||||
xmit_to_rank,
|
||||
(void *)&rbuf[mu+4][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
Grid.SendToRecvFromComplete(requests);
|
||||
Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
|
||||
xmit_to_rank,
|
||||
(void *)&rbuf[mu+4][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -89,6 +89,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
std::cout << GridLogMessage;
|
||||
std::cout << latt_size;
|
||||
std::cout << "\t\t";
|
||||
|
||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||
GridRedBlackCartesian RBGrid(&Grid);
|
||||
|
@ -154,6 +154,7 @@ AC_ARG_ENABLE([accelerator],
|
||||
case ${ac_ACCELERATOR} in
|
||||
cuda)
|
||||
echo CUDA acceleration
|
||||
LIBS="${LIBS} -lcuda"
|
||||
AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
|
||||
sycl)
|
||||
echo SYCL acceleration
|
||||
@ -323,7 +324,6 @@ case ${CXXTEST} in
|
||||
# CXXLD="nvcc -v -link"
|
||||
CXX="${CXXBASE} -x cu "
|
||||
CXXLD="${CXXBASE} -link"
|
||||
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
|
||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
|
||||
if test $ac_openmp = yes; then
|
||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
|
||||
@ -483,8 +483,7 @@ case ${ac_SHM} in
|
||||
LDFLAGS_CPY=$LDFLAGS
|
||||
CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
|
||||
LDFLAGS="$AM_LDFLAGS $LDFLAGS"
|
||||
AC_SEARCH_LIBS([shm_unlink], [rt], [],
|
||||
[AC_MSG_ERROR("no library found for shm_unlink")])
|
||||
AC_SEARCH_LIBS([shm_unlink], [rt], [],[AC_MSG_ERROR("no library found for shm_unlink")])
|
||||
CXXFLAGS=$CXXFLAGS_CPY
|
||||
LDFLAGS=$LDFLAGS_CPY
|
||||
;;
|
||||
|
Loading…
Reference in New Issue
Block a user