
Speed up Cshift

Peter Boyle
2020-05-11 17:02:01 -04:00
parent 8c31c065b5
commit 07c0c02f8c
12 changed files with 373 additions and 265 deletions
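
The core of the change is a new two-dimensional, non-blocking kernel launch, accelerator_for2dNB, which puts the SIMT lane on the fastest-moving thread index (x) and spreads the site loop over y and z; the old one-dimensional accelerator_forNB is rebuilt on top of it. A minimal usage sketch (editor's illustration with hypothetical buffers, not code from this commit):

// Hypothetical copy kernel showing the new 2D loop shape.
// `out`, `in`, `num1`, `num2` and `Nsimd` are illustrative names.
accelerator_for2d(n, num1, b, num2, Nsimd, {
    out[n*num2+b] = in[n*num2+b];  // lane index stays implicit on GPU
});
// accelerator_for2d = accelerator_for2dNB + accelerator_barrier(),
// so the copy is guaranteed complete at this point.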

View File

@@ -1,10 +1,186 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
uint32_t accelerator_threads;
uint32_t accelerator_threads=8;
uint32_t acceleratorThreads(void) {return accelerator_threads;};
void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
void acceleratorInit(void)
{
int nDevices = 1;
cudaGetDeviceCount(&nDevices);
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// Extract the node-local MPI rank from the environment
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
cudaDeviceProp prop;
prop = gpu_props[i];
printf("GpuInit: ========================\n");
printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name);
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
#ifdef GRID_IBM_SUMMIT
// IBM jsrun makes the CUDA device numbering screwy; it need not match the rank
if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
cudaSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
}
#endif
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
void acceleratorInit(void)
{
int nDevices = 1;
hipGetDeviceCount(&nDevices);
gpu_props = new hipDeviceProp_t[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// Extract the node-local MPI rank from the environment
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
hipGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
hipDeviceProp_t prop;
prop = gpu_props[i];
printf("GpuInit: ========================\n");
printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name);
// GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
#ifdef GRID_IBM_SUMMIT
// IBM jsrun makes the device numbering screwy; it need not match the rank
if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
hipSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
}
#endif
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// Extract the node-local MPI rank from the environment
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ( world_rank == 0 ) {
GridBanner();
}
/*
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
cudaDeviceProp prop;
prop = gpu_props[i];
printf("GpuInit: ========================\n");
printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name);
}
}
*/
if ( world_rank == 0 ) {
printf("GpuInit: ================================================\n");
}
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
void acceleratorInit(void){}
#endif
NAMESPACE_END(Grid);
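
The rank detection above is shared by all three back ends. A condensed, standalone sketch of the same policy (editor's illustration; assumes an OpenMPI launch, with MVAPICH handled identically via MV2_COMM_WORLD_LOCAL_RANK):

#include <cstdlib>
// Returns the node-local MPI rank, or 0 when launched without mpirun.
static int localRankFromEnv(void) {
  const char *s = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
  return s ? atoi(s) : 0;
}
// On non-Summit systems each rank then binds to its own GPU:
//   cudaSetDevice(localRankFromEnv());   // hipSetDevice under HIP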

View File

@@ -51,6 +51,7 @@ NAMESPACE_BEGIN(Grid);
//
// Warp control and info:
//
// acceleratorInit;
// void acceleratorSynchronise(void); // synch warp etc..
// int acceleratorSIMTlane(int Nsimd);
//
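// Editor's note (illustration, not part of the diff): in a kernel body the
// lane can also be read back explicitly, e.g.
//   int lane = acceleratorSIMTlane(Nsimd);  // threadIdx.x under CUDA below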
@@ -69,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
uint32_t acceleratorThreads(void);
void acceleratorThreads(uint32_t);
void acceleratorInit(void);
//////////////////////////////////////////////
// CUDA acceleration
@@ -83,6 +85,32 @@ void acceleratorThreads(uint32_t);
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.x; } // CUDA specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator lane,Iterator iter1,Iterator iter2) mutable { \
__VA_ARGS__; \
}; \
int nt=acceleratorThreads(); \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks (1,(num1+nt-1)/nt,num2); \
LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num1,num2,lambda); \
}
template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
uint64_t x = threadIdx.x;//+ blockDim.x*blockIdx.x;
uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
uint64_t z = threadIdx.z + blockDim.z*blockIdx.z;
if ( (x < num1) && (y<num2) && (z<num3) ) {
Lambda(x,y,z);
}
}
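// Editor's worked example: with nsimd=8, acceleratorThreads()=8 (the new
// default), num1=1024 and num2=1, the macro above launches
//   cu_threads = (8,8,1)            -> 64 threads per block
//   cu_blocks  = (1,(1024+7)/8,1)   =  (1,128,1)
// so x carries the SIMD lane, y the site index, z the second iterator.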
#define accelerator_barrier(dummy) \
{ \
cudaDeviceSynchronize(); \
@@ -91,25 +119,9 @@ void acceleratorThreads(uint32_t);
printf("Cuda error %s \n", cudaGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
#define accelerator_forNB( iterator, num, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(acceleratorThreads(),nsimd); \
dim3 cu_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
}
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
@@ -133,15 +145,6 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
template<typename lambda> __global__
void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
{
uint64_t isite = threadIdx.y;
uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
if ( (osite <Osites) && (isite<Isites) ) {
Lambda(isite,osite);
}
}
#endif
@@ -164,25 +167,29 @@ extern cl::sycl::queue *theGridAccelerator;
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_forNB(iterator,num,nsimd, ... ) \
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[0]; } // SYCL specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
cl::sycl::range<3> local {acceleratorThreads(),1,nsimd}; \
cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
int nt=acceleratorThreads(); \
unsigned long unum1 = num1; \
unsigned long unum2 = num2; \
cl::sycl::range<3> local {nsimd,nt,1}; \
cl::sycl::range<3> global{nsimd,unum1,unum2}; \
cgh.parallel_for<class dslash>( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) mutable { \
auto iterator = item.get_global_id(0); \
auto lane = item.get_global_id(2); \
auto lane = item.get_global_id(0); \
auto iter1 = item.get_global_id(1); \
auto iter2 = item.get_global_id(2); \
{ __VA_ARGS__ }; \
}); \
});
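// Editor's note (assumption, not in the diff): sycl::nd_range requires each
// global extent to be a multiple of the corresponding local extent, so num1
// is implicitly assumed divisible by acceleratorThreads() here; e.g.
//   local  = {nsimd,    8,    1}
//   global = {nsimd, 1024, num2}   // num1=1024, nt=8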
#define accelerator_barrier(dummy) theGridAccelerator->wait();
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
@@ -204,33 +211,49 @@ NAMESPACE_BEGIN(Grid);
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_x; } // HIP specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator lane,Iterator iter1,Iterator iter2 ) mutable { \
{ __VA_ARGS__;} \
}; \
int nt=acceleratorThreads(); \
dim3 hip_threads(nsimd,nt,1); \
dim3 hip_blocks (1,(num1+nt-1)/nt,num2); \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0, \
nsimd,num1,num2,lambda); \
}
template<typename lambda> __global__
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
uint64_t x = hipThreadIdx_x;//+ hipBlockDim_x*hipBlockIdx_x;
uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
uint64_t z = hipThreadIdx_z + hipBlockDim_z*hipBlockIdx_z;
if ( (x < numx) && (y<numy) && (z<numz) ) {
Lambda(x,y,z);
}
}
#define accelerator_barrier(dummy) \
{ \
hipDeviceSynchronize(); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("HIP error %s \n", hipGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
#define accelerator_forNB( iterator, num, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
__VA_ARGS__; \
}; \
dim3 hip_threads(acceleratorThreads(),nsimd); \
dim3 hip_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,num,nsimd,lambda);\
}
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
@@ -241,6 +264,7 @@ inline void *acceleratorAllocShared(size_t bytes)
}
return ptr;
};
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
@@ -251,18 +275,25 @@ inline void *acceleratorAllocDevice(size_t bytes)
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
template<typename lambda> __global__
void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
{
uint64_t isite = hipThreadIdx_y;
uint64_t osite = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
if ( (osite <Osites) && (isite<Isites) ) {
Lambda(isite,osite);
}
}
#endif
//////////////////////////////////////////////
// Common on all GPU targets
//////////////////////////////////////////////
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define accelerator_for( iter, num, nsimd, ... ) \
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#endif
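// Editor's sketch (hypothetical body): the NB forms return as soon as the
// kernel is queued, letting the host overlap other work before one explicit
// wait -- the kind of pattern a faster Cshift can exploit:
//   accelerator_forNB(ss, nsites, Nsimd, { /* pack comms buffer */ });
//   /* ... host-side setup that does not touch the buffer ... */
//   accelerator_barrier();  // pack complete; safe to post MPI sends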
@@ -280,6 +311,9 @@ void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CPU fallback: no SIMT lanes
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
@@ -303,7 +337,6 @@ inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif // CPU target
///////////////////////////////////////////////////
@@ -325,25 +358,4 @@ accelerator_inline void acceleratorSynchronise(void)
return;
}
////////////////////////////////////////////////////
// Address subvectors on accelerators
////////////////////////////////////////////////////
#ifdef GRID_SIMT
#ifdef GRID_CUDA
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
#endif
#ifdef GRID_SYCL
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
#endif
#ifdef GRID_HIP
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_y; } // HIP specific
#endif
#else
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
#endif
NAMESPACE_END(Grid);

View File

@@ -58,6 +58,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#endif
#define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for2d( i1, n1,i2,n2, ... ) \
DO_PRAGMA(omp parallel for collapse(2)) \
for ( uint64_t i1=0;i1<n1;i1++) { \
for ( uint64_t i2=0;i2<n2;i2++) { \
{ __VA_ARGS__ } ; \
}}
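// Editor's usage sketch (illustrative names): both loops run in one
// collapsed OpenMP region, e.g. a plane-by-plane copy:
//   thread_for2d(p, Nplanes, s, Nsites, {
//     out[p*Nsites+s] = in[p*Nsites+s];
//   });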
#define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
#define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;