1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-19 00:07:05 +01:00

Speed up Cshift

This commit is contained in:
Peter Boyle
2020-05-11 17:02:01 -04:00
parent 8c31c065b5
commit 07c0c02f8c
12 changed files with 373 additions and 265 deletions

View File

@ -73,12 +73,6 @@ feenableexcept (unsigned int excepts)
}
#endif
uint32_t gpu_threads=8;
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
@ -196,16 +190,12 @@ void GridParseLayout(char **argv,int argc,
assert(ompthreads.size()==1);
GridThread::SetThreads(ompthreads[0]);
}
if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){
std::vector<int> gputhreads(0);
#ifndef GRID_CUDA
std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
<< " not compiled with GPU support" << std::endl;
#endif
arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads");
arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads");
GridCmdOptionIntVector(arg,gputhreads);
assert(gputhreads.size()==1);
gpu_threads=gputhreads[0];
acceleratorThreads(gputhreads[0]);
}
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
@ -245,8 +235,6 @@ static int Grid_is_initialised;
/////////////////////////////////////////////////////////
void GridBanner(void)
{
static int printed =0;
if( !printed ) {
std::cout <<std::endl;
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
@ -282,125 +270,7 @@ void GridBanner(void)
std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
#endif
std::cout << std::endl;
printed=1;
}
}
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
void GridGpuInit(void)
{
int nDevices = 1;
cudaGetDeviceCount(&nDevices);
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ( world_rank == 0 ) {
GridBanner();
}
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
cudaDeviceProp prop;
prop = gpu_props[i];
printf("GpuInit: ========================\n");
printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name);
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
cudaSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
}
#endif
#ifdef GRID_SYCL
void GridGpuInit(void)
{
int nDevices = 1;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ( world_rank == 0 ) {
GridBanner();
}
/*
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
cudaDeviceProp prop;
prop = gpu_props[i];
printf("GpuInit: ========================\n");
printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name);
}
}
*/
if ( world_rank == 0 ) {
printf("GpuInit: ================================================\n");
}
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))
// Build without CUDA or SYCL: there is no accelerator to set up, so the
// start-up hook is intentionally a no-op.
void GridGpuInit(void)
{
}
#endif
void Grid_init(int *argc,char ***argv)
{
@ -414,7 +284,7 @@ void Grid_init(int *argc,char ***argv)
//////////////////////////////////////////////////////////
// Early initialisation necessities without rank knowledge
//////////////////////////////////////////////////////////
GridGpuInit(); // Must come first to set device prior to MPI init
acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
int MB;
@ -483,7 +353,6 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
MemoryProfiler::debug = true;
MemoryProfiler::stats = &dbgMemStats;