
Systematise the accelerator primitives and relocate them to Grid/threads/Accelerator.h / Accelerator.cc

Aim to reduce the amount of CUDA and other backend-specific code variation scattered around the codebase.

Will move GpuInit into Accelerator.cc from Init.cc
Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows
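
For context, the unified primitive this refactor centralises looks roughly like the following. This is a sketch reconstructed from the acceleratorSIMTlane calls and the per-backend SIMTlane definitions removed in the diff below; the actual contents of Accelerator.h may differ:

// Sketch only: one compile-time-selected name, acceleratorSIMTlane, so
// call sites never touch threadIdx or SPIR-V builtins directly.
#if defined(GRID_CUDA)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; }
#elif defined(GRID_SYCL)
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
  return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
}
#else
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // host build: single lane
#endif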
Author: Peter Boyle
Date:   2020-05-08 06:23:55 -07:00
parent 28a1fcaaff
commit f8b8e00090
13 changed files with 557 additions and 718 deletions


@@ -31,24 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-//accelerator_inline void SIMTsynchronise(void)
-accelerator_inline void synchronise(void)
-{
-#ifdef GRID_SIMT
-#ifdef GRID_CUDA
-  // __syncthreads();
-  __syncwarp();
-#endif
-#endif
-  return;
-}
 #ifndef GRID_SIMT
 //////////////////////////////////////////
 // Trivial mapping of vectors on host
 //////////////////////////////////////////
-accelerator_inline int SIMTlane(int Nsimd) { return 0; } // CUDA specific
 template<class vobj> accelerator_inline
 vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
 {
@@ -68,7 +55,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 {
-  // vstream(vec, extracted);
   vec = extracted;
 }
 template<class vobj> accelerator_inline
@@ -77,31 +63,24 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
   vstream(vec, extracted);
 }
 #else
-#ifdef GRID_CUDA
-accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
-#endif
-#ifdef GRID_SYCL
-//accelerator_inline int SIMTlane(int Nsimd) { return __spirv_BuiltInGlobalInvocationId[2]; } //SYCL specific
-accelerator_inline int SIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
-#endif
 //////////////////////////////////////////
 // Extract and insert slices on the GPU
 //////////////////////////////////////////
 template<class vobj> accelerator_inline
-typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=SIMTlane(vobj::Nsimd()))
+typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
   return extractLane(lane,vec);
 }
 template<class vobj> accelerator_inline
-typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=SIMTlane(vobj::Nsimd()))
+typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
   int mask = vobj::Nsimd() >> (ptype + 1);
   int plane= doperm ? lane ^ mask : lane;
   return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
-void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=SIMTlane(vobj::Nsimd()))
+void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
   insertLane(lane,vec,extracted);
 }
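
The point of these accessors is that kernel bodies are written once against coalescedRead/coalescedWrite, and the defaulted lane argument does the right thing per build: on a GRID_SIMT build each GPU thread extracts one SIMT lane of the vector object, while on a host build the whole vector object is read and the loop stays scalar. A schematic usage example follows; the accelerator_for loop and the View() calls are assumed from Grid's wider API of this era, not part of this commit:

// Schematic: copy one lattice field to another through the coalesced accessors.
// Each GPU thread handles one lane via the defaulted acceleratorSIMTlane argument;
// on the host, coalescedRead returns the full vobj and nothing is split.
auto in_v  = in.View();
auto out_v = out.View();
accelerator_for(ss, in.Grid()->oSites(), vobj::Nsimd(), {
    coalescedWrite(out_v[ss], coalescedRead(in_v[ss]));
});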