From 6e2e904a0edae085d02965425e522ccdc4ca6875 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 4 Jun 2019 20:46:35 +0100 Subject: [PATCH] NVCC compiles happy. Start to develop strategy for writing generic code for GPU kernels and CPU kernels. --- Grid/tensors/Tensor_SIMT.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index 9b329748..fc146fc5 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); ////////////////////////////////////////// // Trivial mapping of vectors on host ////////////////////////////////////////// + template<class vobj> accelerator_inline vobj coalescedRead(const vobj & __restrict__ vec) { @@ -57,23 +58,23 @@ void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted) vstream(vec, extracted); } #else +accelerator_inline int SIMTlane(int Nsimd){ return threadIdx.x % Nsimd; } // CUDA specific + ////////////////////////////////////////// // Extract and insert slices on the GPU ////////////////////////////////////////// template<class vobj> accelerator_inline typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec) { - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); - int lane = threadIdx.x % Nsimd; + const int Nsimd = vobj::Nsimd(); + int lane = SIMTlane(Nsimd); return extractLane(lane,vec); } template<class vobj> accelerator_inline typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm) { - constexpr int Nsimd = vobj::Nsimd(); - int lane = threadIdx.x % Nsimd; + const int Nsimd = vobj::Nsimd(); + int lane = SIMTlane(Nsimd); int mask = Nsimd >> (ptype + 1); int plane= doperm ? 
lane ^ mask : lane; return extractLane(plane,vec); @@ -81,10 +82,8 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec, template<class vobj> accelerator_inline void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted) { - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); - int lane = threadIdx.x % Nsimd; + const int Nsimd = vobj::Nsimd(); + int lane = SIMTlane(Nsimd); insertLane(lane,vec,extracted); } #endif