mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-25 21:25:56 +01:00
Changes to add back shared memory test on GPU
This commit is contained in:
parent
11dc2c5e1d
commit
4f89f603ae
@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
||||||
|
|
||||||
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
||||||
|
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
|
||||||
}
|
}
|
||||||
ShmBufferFreeAll();
|
ShmBufferFreeAll();
|
||||||
|
|
||||||
@ -1011,29 +1012,20 @@ void SharedMemory::SharedMemoryTest(void)
|
|||||||
check[0]=GlobalSharedMemory::WorldNode;
|
check[0]=GlobalSharedMemory::WorldNode;
|
||||||
check[1]=r;
|
check[1]=r;
|
||||||
check[2]=magic;
|
check[2]=magic;
|
||||||
// std::cerr << " ShmRank "<<ShmRank<<" storing "<<GlobalSharedMemory::WorldNode<<","<<r<<","<<std::hex<<magic<<" to buf "<<ShmCommBufs[r]
|
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
|
||||||
// <<std::dec<<std::endl;
|
|
||||||
acceleratorPut(ShmCommBufs[r][0],check[0]);
|
|
||||||
acceleratorPut(ShmCommBufs[r][1],check[1]);
|
|
||||||
acceleratorPut(ShmCommBufs[r][2],check[2]);
|
|
||||||
// GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ShmBarrier();
|
ShmBarrier();
|
||||||
for(uint64_t r=0;r<ShmSize;r++){
|
for(uint64_t r=0;r<ShmSize;r++){
|
||||||
ShmBarrier();
|
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
|
||||||
// GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
|
// accelerator_barrier();
|
||||||
// std::cerr << " ShmRank "<<ShmRank<<" read "<<check[0]<<","<<check[1]<<","<<std::hex<<check[2]<<" from buf "<<ShmCommBufs[r]
|
// std::cerr << " ShmRank "<<ShmRank<<" read "<<check[0]<<","<<check[1]<<","<<std::hex<<check[2]<<" from buf "<<ShmCommBufs[r]
|
||||||
// <<std::dec<<std::endl;
|
// <<std::dec<<std::endl;
|
||||||
check[0] = acceleratorGet(ShmCommBufs[r][0]);
|
|
||||||
check[1] = acceleratorGet(ShmCommBufs[r][1]);
|
|
||||||
check[2] = acceleratorGet(ShmCommBufs[r][2]);
|
|
||||||
ShmBarrier();
|
|
||||||
assert(check[0]==GlobalSharedMemory::WorldNode);
|
assert(check[0]==GlobalSharedMemory::WorldNode);
|
||||||
assert(check[1]==r);
|
assert(check[1]==r);
|
||||||
assert(check[2]==magic);
|
assert(check[2]==magic);
|
||||||
ShmBarrier();
|
|
||||||
}
|
}
|
||||||
|
ShmBarrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
void *SharedMemory::ShmBuffer(int rank)
|
void *SharedMemory::ShmBuffer(int rank)
|
||||||
|
@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
typedef int acceleratorEvent_t;
|
||||||
|
|
||||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||||
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
|
||||||
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
|
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||||
|
acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||||
|
acceleratorCopyFromDevice(from,to,bytes);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
|
||||||
{
|
{
|
||||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
||||||
|
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||||
|
{
|
||||||
|
//auto discard=cudaStreamSynchronize(ev);
|
||||||
|
}
|
||||||
|
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
||||||
|
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr)
|
inline int acceleratorIsCommunicable(void *ptr)
|
||||||
@ -478,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
|||||||
inline void *acceleratorAllocHost(size_t bytes)
|
inline void *acceleratorAllocHost(size_t bytes)
|
||||||
{
|
{
|
||||||
void *ptr=NULL;
|
void *ptr=NULL;
|
||||||
auto err = hipMallocHost((void **)&ptr,bytes);
|
auto err = hipHostMalloc((void **)&ptr,bytes);
|
||||||
if( err != hipSuccess ) {
|
if( err != hipSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
||||||
@ -516,18 +530,30 @@ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ a
|
|||||||
|
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
||||||
|
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
|
typedef int acceleratorEvent_t;
|
||||||
|
|
||||||
|
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
|
||||||
{
|
{
|
||||||
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
|
acceleratorCopyToDevice(from,to,bytes);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
|
acceleratorCopyFromDevice(from,to,bytes);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
||||||
|
|
||||||
|
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||||
|
{
|
||||||
|
// auto discard=hipStreamSynchronize(ev);
|
||||||
|
}
|
||||||
|
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
inline void acceleratorPin(void *ptr,unsigned long bytes)
|
inline void acceleratorPin(void *ptr,unsigned long bytes)
|
||||||
@ -564,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
|
|||||||
|
|
||||||
#undef GRID_SIMT
|
#undef GRID_SIMT
|
||||||
|
|
||||||
|
typedef int acceleratorEvent_t;
|
||||||
|
|
||||||
inline void acceleratorMem(void)
|
inline void acceleratorMem(void)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
@ -583,9 +611,12 @@ inline void acceleratorMem(void)
|
|||||||
|
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||||
|
|
||||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
|
||||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
inline void acceleratorEventWait(acceleratorEvent_t ev){}
|
||||||
|
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
|
||||||
|
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
|
||||||
|
|
||||||
inline void acceleratorCopySynchronise(void) {};
|
inline void acceleratorCopySynchronise(void) {};
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||||
|
Loading…
x
Reference in New Issue
Block a user