#include <Grid/GridCore.h>

NAMESPACE_BEGIN(Grid);

int world_rank; // Used to control world rank for print guarding
int acceleratorAbortOnGpuError=1;
uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void)       {return accelerator_threads;};
void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};

#define ENV_LOCAL_RANK_PALS    "PALS_LOCAL_RANKID"
#define ENV_RANK_PALS          "PALS_RANKID"
#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
#define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
#define ENV_RANK_SLURM         "SLURM_PROCID"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"

#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t computeStream;
void acceleratorInit(void)
{
  int nDevices = 1;
  cudaGetDeviceCount(&nDevices);
  gpu_props = new cudaDeviceProp[nDevices];

  char * localRankStr = NULL;
  int rank = 0;
  world_rank=0;

  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}

  // We extract the local rank initialization using an environment variable
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) {
    if (!world_rank) printf("OPENMPI detected\n");
    rank = atoi(localRankStr);
  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) {
    if (!world_rank) printf("MVAPICH detected\n");
    rank = atoi(localRankStr);
  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) {
    if (!world_rank) printf("SLURM detected\n");
    rank = atoi(localRankStr);
  } else {
    if (!world_rank) printf("MPI version is unknown - bad things may happen\n");
  }

  size_t totalDeviceMem=0;
  for (int i = 0; i < nDevices; i++) {

#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit[%d]: " #canMapHostMemory ": " FMT" \n",rank,prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory)         GPU_PROP_FMT(canMapHostMemory,"%d");

    cudaGetDeviceProperties(&gpu_props[i], i);
    cudaDeviceProp prop;
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
    if ( world_rank == 0) {
      if ( i==rank ) {
        printf("AcceleratorCudaInit[%d]: ========================\n",rank);
        printf("AcceleratorCudaInit[%d]: Device Number    : %d\n", rank,i);
        printf("AcceleratorCudaInit[%d]: ========================\n",rank);
        printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);

        GPU_PROP_FMT(totalGlobalMem,"%lld");
        GPU_PROP(managedMemory);
        GPU_PROP(isMultiGpuBoard);
        GPU_PROP(warpSize);
        GPU_PROP(pciBusID);
        GPU_PROP(pciDeviceID);
        printf("AcceleratorCudaInit[%d]: maxGridSize (%d,%d,%d)\n",rank,prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2]);
      }
      //      GPU_PROP(unifiedAddressing);
      //      GPU_PROP(l2CacheSize);
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }

  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT
#undef GPU_PROP

#ifdef GRID_DEFAULT_GPU
  int device = 0;
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
    printf("AcceleratorCudaInit: assume user either uses\n");
    printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
    printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
#else
  int device = rank;
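  // Note: binding the device index to the node-local rank presumes the launcher
  // exposes at least as many visible GPUs per node as it places local ranks;
  // if that does not hold, the cudaSetDevice() call below fails silently,
  // since its return code is not checked here.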
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank); printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n"); #endif cudaSetDevice(device); cudaStreamCreate(©Stream); cudaStreamCreate(&computeStream); const int len=64; char busid[len]; if( rank == world_rank ) { cudaDeviceGetPCIBusId(busid, len, device); printf("local rank %d device %d bus id: %s\n", rank, device, busid); } if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); } #endif #ifdef GRID_HIP hipDeviceProp_t *gpu_props; hipStream_t copyStream; hipStream_t computeStream; void acceleratorInit(void) { int nDevices = 1; hipGetDeviceCount(&nDevices); gpu_props = new hipDeviceProp_t[nDevices]; char * localRankStr = NULL; int rank = 0; world_rank=0; // We extract the local rank initialization using an environment variable if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { rank = atoi(localRankStr); } if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { rank = atoi(localRankStr); } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);} if ( world_rank == 0 ) printf("world_rank %d has %d devices\n",world_rank,nDevices); size_t totalDeviceMem=0; for (int i = 0; i < nDevices; i++) { #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); auto r=hipGetDeviceProperties(&gpu_props[i], i); hipDeviceProp_t prop; prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: Device Number : %d\n", i); printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); GPU_PROP_FMT(totalGlobalMem,"%lu"); // GPU_PROP(managedMemory); GPU_PROP(isMultiGpuBoard); GPU_PROP(warpSize); // GPU_PROP(unifiedAddressing); // GPU_PROP(l2CacheSize); // GPU_PROP(singleToDoublePrecisionPerfRatio); } } MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours #undef GPU_PROP_FMT #undef GPU_PROP #ifdef GRID_DEFAULT_GPU if ( world_rank == 0 ) { printf("AcceleratorHipInit: using default device \n"); printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n"); printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n"); } int device = 0; #else if ( world_rank == 0 ) { printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank); printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n"); } int device = rank; #endif hipSetDevice(device); hipStreamCreate(©Stream); hipStreamCreate(&computeStream); const int len=64; char busid[len]; if( rank == world_rank ) { hipDeviceGetPCIBusId(busid, len, device); printf("local rank %d device %d bus id: %s\n", rank, device, busid); } if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); } #endif #ifdef GRID_SYCL cl::sycl::queue *theGridAccelerator; cl::sycl::queue *theCopyAccelerator; void acceleratorInit(void) { int nDevices = 1; cl::sycl::gpu_selector selector; cl::sycl::device selectedDevice { selector }; theGridAccelerator = new sycl::queue 
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator;
void acceleratorInit(void)
{
  int nDevices = 1;
  cl::sycl::gpu_selector selector;
  cl::sycl::device selectedDevice { selector };
  theGridAccelerator = new sycl::queue (selectedDevice);
  //  theCopyAccelerator = new sycl::queue (selectedDevice);
  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.

#ifdef GRID_SYCL_LEVEL_ZERO_IPC
  zeInit(0);
#endif

  char * localRankStr = NULL;
  int rank = 0;
  world_rank=0;

  // We extract the local rank initialization using an environment variable
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) {
    rank = atoi(localRankStr);
  }
  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) {
    rank = atoi(localRankStr);
  }
  if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL) {
    rank = atoi(localRankStr);
  }
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_PALS   )) != NULL) { world_rank = atoi(localRankStr);}

  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, HOST_NAME_MAX+1);
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);

  auto devices = cl::sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){

#define GPU_PROP_STR(prop) \
    printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());

#define GPU_PROP_FMT(prop,FMT) \
    printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());

#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");

    if ( world_rank == 0) {
      GPU_PROP_STR(vendor);
      GPU_PROP_STR(version);
      //    GPU_PROP_STR(device_type);
      /*
      GPU_PROP(max_compute_units);
      GPU_PROP(native_vector_width_char);
      GPU_PROP(native_vector_width_short);
      GPU_PROP(native_vector_width_int);
      GPU_PROP(native_vector_width_long);
      GPU_PROP(native_vector_width_float);
      GPU_PROP(native_vector_width_double);
      GPU_PROP(native_vector_width_half);
      GPU_PROP(address_bits);
      GPU_PROP(half_fp_config);
      GPU_PROP(single_fp_config);
      */
      //    GPU_PROP(double_fp_config);
      GPU_PROP(global_mem_size);
    }
  }

  if ( world_rank == 0 ) {
    auto name = theGridAccelerator->get_device().get_info<sycl::info::device::name>();
    printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str());
    printf("AcceleratorSyclInit: ================================================\n");
  }
}
#endif

#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL)) && (!defined(GRID_HIP))
void acceleratorInit(void){}
#endif

NAMESPACE_END(Grid);
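/*
  Usage sketch (illustrative only, not part of this file). acceleratorInit()
  is intended to run once per process before any device allocations; in a
  Grid application it is normally reached through Grid_init():

    #include <Grid/Grid.h>

    int main(int argc, char **argv)
    {
      Grid::Grid_init(&argc, &argv);  // accelerator initialisation happens here
      // ... construct grids, fields, solvers ...
      Grid::Grid_finalize();
      return 0;
    }
*/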