mirror of https://github.com/paboyle/Grid.git
synced 2025-06-17 15:27:06 +01:00
Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc
Aim to reduce the amount of CUDA and other code variations floating around all over the place. Will move GpuInit into Accelerator.cc from Init.cc. Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows.
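For orientation, the lane primitives deleted in the diff below are the pieces that migrate to Grid/threads/Accelerator.h under the new acceleratorSIMTlane name. A minimal sketch of how the consolidated definition could look there, assuming it keeps the exact bodies removed from this file; the guard arrangement is illustrative, not the committed code:

// Hypothetical consolidated form in Accelerator.h -- bodies taken from
// the lines removed below; the #if layout is for illustration only.
#if defined(GRID_SIMT) && defined(GRID_CUDA)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA: lane index = y thread
#elif defined(GRID_SIMT) && defined(GRID_SYCL)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL: lane = local id[2]
#else
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; }           // host: scalar execution, single lane
#endif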
@@ -31,24 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 
-//accelerator_inline void SIMTsynchronise(void)
-accelerator_inline void synchronise(void)
-{
-#ifdef GRID_SIMT
-#ifdef GRID_CUDA
-  //  __syncthreads();
-  __syncwarp();
-#endif
-#endif
-  return;
-}
-
 #ifndef GRID_SIMT
 //////////////////////////////////////////
 // Trivial mapping of vectors on host
 //////////////////////////////////////////
 accelerator_inline int SIMTlane(int Nsimd) { return 0; } // CUDA specific
 
 template<class vobj> accelerator_inline
 vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
 {
@@ -68,7 +55,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 {
-  //  vstream(vec, extracted);
   vec = extracted;
 }
 template<class vobj> accelerator_inline
@@ -77,31 +63,24 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
   vstream(vec, extracted);
 }
 #else
-#ifdef GRID_CUDA
-accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
-#endif
-#ifdef GRID_SYCL
-//accelerator_inline int SIMTlane(int Nsimd) { return __spirv_BuiltInGlobalInvocationId[2]; } //SYCL specific
-accelerator_inline int SIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
-#endif
 
 //////////////////////////////////////////
 // Extract and insert slices on the GPU
 //////////////////////////////////////////
 template<class vobj> accelerator_inline
-typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=SIMTlane(vobj::Nsimd()))
+typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
   return extractLane(lane,vec);
 }
 template<class vobj> accelerator_inline
-typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=SIMTlane(vobj::Nsimd()))
+typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
   int mask = vobj::Nsimd() >> (ptype + 1);
   int plane= doperm ? lane ^ mask : lane;
   return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
-void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=SIMTlane(vobj::Nsimd()))
+void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
   insertLane(lane,vec,extracted);
 }
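The doperm arithmetic in coalescedReadPermute above pairs each lane with a partner a power-of-two distance away, selected by ptype. A worked example, assuming Nsimd = 8 and simply evaluating mask = Nsimd >> (ptype + 1) and plane = lane ^ mask:

// ptype = 0: mask = 8 >> 1 = 4, lanes exchange as (0,4)(1,5)(2,6)(3,7) -- the coarsest permutation
// ptype = 1: mask = 8 >> 2 = 2, lanes exchange as (0,2)(1,3)(4,6)(5,7)
// ptype = 2: mask = 8 >> 3 = 1, nearest-neighbour pairs (0,1)(2,3)(4,5)(6,7)
// doperm = 0 leaves plane == lane, i.e. a plain coalescedRead.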
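For context on how these primitives are consumed, a minimal call-site sketch; copyOneSite is a hypothetical helper, not part of the commit, and relies only on the default lane argument now routing through acceleratorSIMTlane:

// Hypothetical call site: on GPU each thread handles one SIMD lane and
// auto deduces vobj::scalar_object; on host the whole vector is moved.
template<class vobj> accelerator_inline
void copyOneSite(vobj &out, const vobj &in)
{
  auto s = coalescedRead(in);   // default lane = acceleratorSIMTlane(vobj::Nsimd())
  coalescedWrite(out, s);       // writes this lane back (GPU) or the full vector (host)
}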