From 07c0c02f8c1d58605150c4729f7d1b3b32416045 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 11 May 2020 17:02:01 -0400 Subject: [PATCH] Speed up Cshift --- Grid/allocator/AlignedAllocator.h | 10 +- Grid/cartesian/Cartesian_base.h | 1 + Grid/cartesian/Cartesian_full.h | 4 + Grid/cartesian/Cartesian_red_black.h | 18 ++- Grid/cshift/Cshift_common.h | 85 +++++++------ Grid/threads/Accelerator.cc | 182 ++++++++++++++++++++++++++- Grid/threads/Accelerator.h | 178 ++++++++++++++------------ Grid/threads/Threads.h | 6 + Grid/util/Init.cc | 139 +------------------- tests/core/Test_cshift_red_black.cc | 2 +- tests/core/Test_cshift_rotate.cc | 1 + tests/core/Test_main.cc | 12 +- 12 files changed, 373 insertions(+), 265 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index a29c8bcb..c8742d3e 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -29,7 +29,6 @@ Author: Peter Boyle #ifndef GRID_ALIGNED_ALLOCATOR_H #define GRID_ALIGNED_ALLOCATOR_H - NAMESPACE_BEGIN(Grid); /*Move control to configure.ac and Config.h*/ @@ -157,6 +156,15 @@ public: assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); +#if 0 + size_type page_size=4096; + size_type pages = (bytes+page_size-1)/page_size; + uint8_t *bp = (uint8_t *)ptr; + + accelerator_for(pg,pages,1,{ + bp[pg*page_size]=0; + }); +#endif return ptr; } diff --git a/Grid/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h index 87472cc9..ae1fd1fd 100644 --- a/Grid/cartesian/Cartesian_base.h +++ b/Grid/cartesian/Cartesian_base.h @@ -81,6 +81,7 @@ public: bool _isCheckerBoarded; int LocallyPeriodic; + Coordinate _checker_dim_mask; public: diff --git a/Grid/cartesian/Cartesian_full.h b/Grid/cartesian/Cartesian_full.h index c083817b..31a67bf0 100644 --- a/Grid/cartesian/Cartesian_full.h +++ b/Grid/cartesian/Cartesian_full.h @@ -38,6 +38,7 @@ class GridCartesian: public GridBase { public: int dummy; + Coordinate _checker_dim_mask; virtual int CheckerBoardFromOindexTable (int Oindex) { return 0; } @@ -104,6 +105,7 @@ public: _ldimensions.resize(_ndimension); _rdimensions.resize(_ndimension); _simd_layout.resize(_ndimension); + _checker_dim_mask.resize(_ndimension);; _lstart.resize(_ndimension); _lend.resize(_ndimension); @@ -114,6 +116,8 @@ public: for (int d = 0; d < _ndimension; d++) { + _checker_dim_mask[d]=0; + _fdimensions[d] = dimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions _simd_layout[d] = simd_layout[d]; diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h index 34f763d2..b71981f5 100644 --- a/Grid/cartesian/Cartesian_red_black.h +++ b/Grid/cartesian/Cartesian_red_black.h @@ -35,12 +35,28 @@ static const int CbRed =0; static const int CbBlack=1; static const int Even =CbRed; static const int Odd =CbBlack; + +accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk) +{ + int nd=rdim.size(); + Coordinate coor(nd); + + Lexicographic::CoorFromIndex(coor,oindex,rdim); + + int linear=0; + for(int d=0;d _checker_board; diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index 954342cb..fe9afc62 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -29,6 +29,8 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); +extern Vector > Cshift_table; + /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split 
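The Cshift_common.h hunks below all follow the same pattern: the (buffer offset, lattice offset) pairs for a plane are filled once on the host into the shared Cshift_table, which is grown on demand and reused across calls, and the data movement itself becomes a flat accelerator_for over a raw pointer into that table. A minimal stand-alone sketch of the idea, with std::vector and a serial loop standing in for Grid's Vector and accelerator_for (the function and variable names here are illustrative, not Grid's):

#include <cstdio>
#include <utility>
#include <vector>

// Persistent table of (dest, src) index pairs, grown on demand and reused
// across calls, standing in for Grid's Cshift_table.
static std::vector<std::pair<int,int> > shift_table;

void gather_plane(double *buffer, const double *field,
                  int nblock, int block, int stride, int plane_offset)
{
  if (shift_table.size() < size_t(nblock)*block) shift_table.resize(size_t(nblock)*block);

  int ent = 0;
  for (int n = 0; n < nblock; n++)          // host fills the index table
    for (int b = 0; b < block; b++)
      shift_table[ent++] = std::make_pair(n*block + b, plane_offset + n*stride + b);

  const std::pair<int,int> *table = shift_table.data();  // raw pointer, usable in a device lambda
  for (int i = 0; i < ent; i++)             // accelerator_for(i,ent,1,{...}) in the patch
    buffer[table[i].first] = field[table[i].second];
}

int main(void)
{
  std::vector<double> field(64, 1.0), buffer(16, 0.0);
  gather_plane(buffer.data(), field.data(), /*nblock*/4, /*block*/4, /*stride*/16, /*plane_offset*/8);
  std::printf("buffer[0] = %f\n", buffer[0]);
  return 0;
}

The grow-only resize mirrors the if(Cshift_table.size()<e1*e2) check in the patch, so repeated shifts of the same geometry avoid reallocating the table.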
/////////////////////////////////////////////////////////////////// @@ -46,7 +48,8 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int e2=rhs.Grid()->_slice_block[dimension]; int ent = 0; - static Vector > table; table.resize(e1*e2); + if(Cshift_table.size()_slice_stride[dimension]; auto rhs_v = rhs.View(); @@ -55,7 +58,7 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen for(int b=0;b(off+bo+b,so+o+b); + Cshift_table[ent++] = std::pair(off+bo+b,so+o+b); } } } else { @@ -65,13 +68,15 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int o = n*stride; int ocb=1<CheckerBoardFromOindex(o+b); if ( ocb &cbmask ) { - table[ent++]=std::pair (off+bo++,so+o+b); + Cshift_table[ent++]=std::pair (off+bo++,so+o+b); } } } } - thread_for(i,ent,{ - buffer[table[i].first]=rhs_v[table[i].second]; + auto buffer_p = & buffer[0]; + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; }); } @@ -97,34 +102,36 @@ Gather_plane_extract(const Lattice &rhs, auto rhs_v = rhs.View(); if ( cbmask ==0x3){ - thread_for_collapse(2,n,e1,{ - for(int b=0;b(temp,pointers,offset); - } - }); + }); } else { - // Case of SIMD split AND checker dim cannot currently be hit, except in - // Test_cshift_red_black code. - std::cout << " Dense packed buffer WARNING " <_rdimensions; + Coordinate cdm =rhs.Grid()->_checker_dim_mask; + std::cout << " Dense packed buffer WARNING " <CheckerBoardFromOindex(o+b); + int oindex = o+b; + + int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); + + int ocb=1<(temp,pointers,offset); } - } - }); + }); } } @@ -145,7 +152,8 @@ template void Scatter_plane_simple (Lattice &rhs,commVector_slice_block[dimension]; int stride=rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + if(Cshift_table.size() void Scatter_plane_simple (Lattice &rhs,commVector_slice_stride[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension]; - table[ent++] = std::pair(so+o+b,bo+b); + Cshift_table[ent++] = std::pair(so+o+b,bo+b); } } @@ -165,15 +173,17 @@ template void Scatter_plane_simple (Lattice &rhs,commVector_slice_stride[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { - table[ent++]=std::pair (so+o+b,bo++); + Cshift_table[ent++]=std::pair (so+o+b,bo++); } } } } auto rhs_v = rhs.View(); - thread_for(i,ent,{ - rhs_v[table[i].first]=buffer[table[i].second]; + auto buffer_p = & buffer[0]; + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + rhs_v[table[i].first]=buffer_p[table[i].second]; }); } @@ -195,13 +205,11 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA if(cbmask ==0x3 ) { auto rhs_v = rhs.View(); - thread_for_collapse(2,n,e1,{ - for(int b=0;b_slice_stride[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension]; merge(rhs_v[so+o+b],pointers,offset); - } - }); + }); } else { // Case of SIMD split AND checker dim cannot currently be hit, except in @@ -225,6 +233,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// + template void Copy_plane(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -239,14 +248,16 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop 
invariant for icpc int e2=rhs.Grid()->_slice_block[dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + + if(Cshift_table.size()(lo+o,ro+o); + Cshift_table[ent++] = std::pair(lo+o,ro+o); } } } else { @@ -255,7 +266,7 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int o =n*stride+b; int ocb=1<CheckerBoardFromOindex(o); if ( ocb&cbmask ) { - table[ent++] = std::pair(lo+o,ro+o); + Cshift_table[ent++] = std::pair(lo+o,ro+o); } } } @@ -263,7 +274,8 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs auto rhs_v = rhs.View(); auto lhs_v = lhs.View(); - thread_for(i,ent,{ + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ lhs_v[table[i].first]=rhs_v[table[i].second]; }); @@ -271,7 +283,6 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs template void Copy_plane_permute(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) { - int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -285,27 +296,29 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice_slice_block [dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + if(Cshift_table.size()(lo+o+b,ro+o+b); + Cshift_table[ent++] = std::pair(lo+o+b,ro+o+b); }} } else { for(int n=0;nCheckerBoardFromOindex(o+b); - if ( ocb&cbmask ) table[ent++] = std::pair(lo+o+b,ro+o+b); + if ( ocb&cbmask ) Cshift_table[ent++] = std::pair(lo+o+b,ro+o+b); }} } auto rhs_v = rhs.View(); auto lhs_v = lhs.View(); - thread_for(i,ent,{ + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); }); } diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 4f2198f8..18cc406d 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -1,10 +1,186 @@ #include NAMESPACE_BEGIN(Grid); -uint32_t accelerator_threads; +uint32_t accelerator_threads=8; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; -#ifdef GRID_SYCL -cl::sycl::queue *theGridAccelerator; + +#ifdef GRID_CUDA +cudaDeviceProp *gpu_props; +void acceleratorInit(void) +{ + int nDevices = 1; + cudaGetDeviceCount(&nDevices); + gpu_props = new cudaDeviceProp[nDevices]; + + char * localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + for (int i = 0; i < nDevices; i++) { + +#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); +#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); + + cudaGetDeviceProperties(&gpu_props[i], i); + if ( world_rank == 0) { + cudaDeviceProp prop; + prop = gpu_props[i]; + printf("GpuInit: ========================\n"); + 
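The rank detection just above reads the launcher's environment variables (OpenMPI's OMPI_COMM_WORLD_LOCAL_RANK, MVAPICH's MV2_COMM_WORLD_LOCAL_RANK) rather than making MPI calls, which keeps acceleratorInit() usable before MPI is initialised. A hedged, stand-alone sketch of that device-binding step; the modulo guard against having more local ranks than devices is an addition for illustration, not something the patch itself does:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Bind this process to one GPU per node-local MPI rank, using only
// environment variables exported by the MPI launcher.
int main(void)
{
  int ndev = 1;
  cudaGetDeviceCount(&ndev);

  const char *s = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK");
  if (!s) s = std::getenv("MV2_COMM_WORLD_LOCAL_RANK");
  int local_rank = s ? std::atoi(s) : 0;

  cudaSetDevice(local_rank % ndev);      // the patch uses the rank directly
  std::printf("local rank %d bound to device %d of %d\n",
              local_rank, local_rank % ndev, ndev);
  return 0;
}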
printf("GpuInit: Device Number : %d\n", i); + printf("GpuInit: ========================\n"); + printf("GpuInit: Device identifier: %s\n", prop.name); + + GPU_PROP(managedMemory); + GPU_PROP(isMultiGpuBoard); + GPU_PROP(warpSize); + // GPU_PROP(unifiedAddressing); + // GPU_PROP(l2CacheSize); + // GPU_PROP(singleToDoublePrecisionPerfRatio); + } + } +#ifdef GRID_IBM_SUMMIT + // IBM Jsrun makes cuda Device numbering screwy and not match rank + if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n"); +#else + if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n"); + cudaSetDevice(rank); #endif + if ( world_rank == 0 ) printf("GpuInit: ================================================\n"); +} +#endif + +#ifdef GRID_HIP +hipDeviceProp_t *gpu_props; +void acceleratorInit(void) +{ + int nDevices = 1; + hipGetDeviceCount(&nDevices); + gpu_props = new hipDeviceProp_t[nDevices]; + + char * localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + for (int i = 0; i < nDevices; i++) { + +#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); +#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); + + hipGetDeviceProperties(&gpu_props[i], i); + if ( world_rank == 0) { + hipDeviceProp_t prop; + prop = gpu_props[i]; + printf("GpuInit: ========================\n"); + printf("GpuInit: Device Number : %d\n", i); + printf("GpuInit: ========================\n"); + printf("GpuInit: Device identifier: %s\n", prop.name); + + // GPU_PROP(managedMemory); + GPU_PROP(isMultiGpuBoard); + GPU_PROP(warpSize); + // GPU_PROP(unifiedAddressing); + // GPU_PROP(l2CacheSize); + // GPU_PROP(singleToDoublePrecisionPerfRatio); + } + } +#ifdef GRID_IBM_SUMMIT + // IBM Jsrun makes cuda Device numbering screwy and not match rank + if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n"); +#else + if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n"); + cudaSetDevice(rank); +#endif + if ( world_rank == 0 ) printf("GpuInit: ================================================\n"); +} +#endif + + +#ifdef GRID_SYCL + +cl::sycl::queue *theGridAccelerator; + +void acceleratorInit(void) +{ + int nDevices = 1; + cl::sycl::gpu_selector selector; + cl::sycl::device selectedDevice { selector }; + theGridAccelerator = new sycl::queue (selectedDevice); + + char * localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if 
((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + if ( world_rank == 0 ) { + GridBanner(); + } + /* + for (int i = 0; i < nDevices; i++) { + +#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); +#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); + + cudaGetDeviceProperties(&gpu_props[i], i); + if ( world_rank == 0) { + cudaDeviceProp prop; + prop = gpu_props[i]; + printf("GpuInit: ========================\n"); + printf("GpuInit: Device Number : %d\n", i); + printf("GpuInit: ========================\n"); + printf("GpuInit: Device identifier: %s\n", prop.name); + } + } + */ + if ( world_rank == 0 ) { + printf("GpuInit: ================================================\n"); + } +} +#endif + +#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP)) +void acceleratorInit(void){} +#endif + NAMESPACE_END(Grid); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 6f2e0b04..1569b22b 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -51,6 +51,7 @@ NAMESPACE_BEGIN(Grid); // // Warp control and info: // +// acceleratorInit; // void acceleratorSynchronise(void); // synch warp etc.. // int acceleratorSIMTlane(int Nsimd); // @@ -69,6 +70,7 @@ NAMESPACE_BEGIN(Grid); uint32_t acceleratorThreads(void); void acceleratorThreads(uint32_t); +void acceleratorInit(void); ////////////////////////////////////////////// // CUDA acceleration @@ -83,6 +85,32 @@ void acceleratorThreads(uint32_t); #define accelerator __host__ __device__ #define accelerator_inline __host__ __device__ inline +accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.x; } // CUDA specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ + { \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator lane,Iterator iter1,Iterator iter2) mutable { \ + __VA_ARGS__; \ + }; \ + int nt=acceleratorThreads(); \ + dim3 cu_threads(nsimd,acceleratorThreads(),1); \ + dim3 cu_blocks (1,(num1+nt-1)/nt,num2); \ + LambdaApply<<>>(nsimd,num1,num2,lambda); \ + } + +template __global__ +void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) +{ + uint64_t x = threadIdx.x;//+ blockDim.x*blockIdx.x; + uint64_t y = threadIdx.y + blockDim.y*blockIdx.y; + uint64_t z = threadIdx.z + blockDim.z*blockIdx.z; + if ( (x < num1) && (y>>(nsimd,num,lambda); \ - } - -#define accelerator_for( iterator, num, nsimd, ... ) \ - accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \ - accelerator_barrier(dummy); - inline void *acceleratorAllocShared(size_t bytes) { void *ptr=NULL; @@ -133,15 +145,6 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; -template __global__ -void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda) -{ - uint64_t isite = threadIdx.y; - uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x; - if ( (osite >()[0]; } // SYCL specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... 
) \ theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ - cl::sycl::range<3> local {acceleratorThreads(),1,nsimd}; \ - cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \ + int nt=acceleratorThreads(); \ + unsigned long unum1 = num1; \ + unsigned long unum2 = num2; \ + cl::sycl::range<3> local {nsimd,nt,1}; \ + cl::sycl::range<3> global{nsimd,unum1,unum2}; \ cgh.parallel_for( \ cl::sycl::nd_range<3>(global,local), \ [=] (cl::sycl::nd_item<3> item) mutable { \ - auto iterator = item.get_global_id(0); \ - auto lane = item.get_global_id(2); \ + auto lane = item.get_global_id(0); \ + auto iter1 = item.get_global_id(1); \ + auto iter2 = item.get_global_id(2); \ { __VA_ARGS__ }; \ }); \ }); +dim3 cu_threads(nsimd,acceleratorThreads(),1); \ +dim3 cu_blocks (1,(num1+nt-1)/n,num2); \ #define accelerator_barrier(dummy) theGridAccelerator->wait(); -#define accelerator_for( iterator, num, nsimd, ... ) \ - accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \ - accelerator_barrier(dummy); - inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; @@ -204,33 +211,49 @@ NAMESPACE_BEGIN(Grid); #define accelerator __host__ __device__ #define accelerator_inline __host__ __device__ inline + +/*These routines define mapping from thread grid to loop & vector lane indexing */ +accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_x; } // HIP specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ + { \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator lane,Iterator iter1,Iterator iter2 ) mutable { \ + { __VA_ARGS__;} \ + }; \ + int nt=acceleratorThreads(); \ + dim3 hip_threads(nsimd,nt,1); \ + dim3 hip_blocks (1,(num1+nt-1)/nt,num2); \ + hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ + 0,0, \ + nsimd,num1,num2,lambda); \ + } + + +template __global__ +void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) +{ + uint64_t x = hipThreadIdx_x;//+ hipBlockDim_x*hipBlockIdx_x; + uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y; + uint64_t z = hipThreadIdx_z + hipBlockDim_z*hipBlockIdx_z; + if ( (x < numx) && (y __global__ -void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda) -{ - uint64_t isite = hipThreadIdx_y; - uint64_t osite = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x; - if ( (osite @@ -303,7 +337,6 @@ inline void acceleratorFreeShared(void *ptr){free(ptr);}; inline void acceleratorFreeDevice(void *ptr){free(ptr);}; #endif - #endif // CPU target /////////////////////////////////////////////////// @@ -325,25 +358,4 @@ accelerator_inline void acceleratorSynchronise(void) return; } -//////////////////////////////////////////////////// -// Address subvectors on accelerators -//////////////////////////////////////////////////// -#ifdef GRID_SIMT - -#ifdef GRID_CUDA -accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific -#endif -#ifdef GRID_SYCL -accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific -#endif -#ifdef GRID_HIP -accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_y; } // HIP specific -#endif - -#else - -accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA 
specific - -#endif - NAMESPACE_END(Grid); diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index 84989853..a9fa13ea 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -58,6 +58,12 @@ Author: paboyle #endif #define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i gputhreads(0); -#ifndef GRID_CUDA - std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was" - << " not compiled with GPU support" << std::endl; -#endif - arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads"); + arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads"); GridCmdOptionIntVector(arg,gputhreads); assert(gputhreads.size()==1); - gpu_threads=gputhreads[0]; + acceleratorThreads(gputhreads[0]); } if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ @@ -245,8 +235,6 @@ static int Grid_is_initialised; ///////////////////////////////////////////////////////// void GridBanner(void) { - static int printed =0; - if( !printed ) { std::cout <
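For reference on the Accelerator.h rewrite above: in the CUDA flavour the SIMT lane rides on threadIdx.x (acceleratorSIMTlane() simply returns it), the first loop index is blocked over threadIdx.y/blockIdx.y in chunks of acceleratorThreads(), and the second loop index maps to blockIdx.z. A stand-alone, hedged approximation of that launch pattern follows; the kernel shape matches the patch, but the toy grid sizes, the managed-memory buffer and the nvcc --extended-lambda requirement are assumptions of this sketch, and the HIP path is the same modulo hipLaunchKernelGGL and hipThreadIdx_x:

#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

template <typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
  uint64_t x = threadIdx.x;                              // SIMT lane
  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;      // first loop index
  uint64_t z = threadIdx.z + blockDim.z*blockIdx.z;      // second loop index
  if ( (x < num1) && (y < num2) && (z < num3) ) Lambda(x, y, z);
}

int main(void)
{
  const uint64_t nsimd = 8, num1 = 1000, num2 = 4;
  const int nt = 8;                                      // plays the role of acceleratorThreads()
  float *out;
  cudaMallocManaged(&out, nsimd*num1*num2*sizeof(float));

  dim3 cu_threads(nsimd, nt, 1);
  dim3 cu_blocks (1, (num1+nt-1)/nt, num2);
  auto body = [=] __device__ (uint64_t lane, uint64_t i1, uint64_t i2) {
    out[(i2*num1 + i1)*nsimd + lane] = float(lane);      // toy body: record the lane index
  };
  LambdaApply<<<cu_blocks, cu_threads>>>(nsimd, num1, num2, body);
  cudaDeviceSynchronize();
  std::printf("out[0]=%f out[%llu]=%f\n", out[0], (unsigned long long)(nsimd-1), out[nsimd-1]);
  cudaFree(out);
  return 0;
}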