mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Slurm: stop nodes from using the same GPU
This commit is contained in:
parent
0b787e9fe0
commit
1efe30d6cc
@ -16,40 +16,53 @@ void acceleratorInit(void)
|
|||||||
char * localRankStr = NULL;
|
char * localRankStr = NULL;
|
||||||
int rank = 0, world_rank=0;
|
int rank = 0, world_rank=0;
|
||||||
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
|
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
|
||||||
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
|
|
||||||
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
|
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
|
||||||
|
#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
|
||||||
|
#define ENV_RANK_SLURM "SLURM_PROCID"
|
||||||
|
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
|
||||||
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
|
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
|
||||||
// We extract the local rank initialization using an environment variable
|
// We extract the local rank initialization using an environment variable
|
||||||
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
|
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) {
|
||||||
{
|
printf("OPENMPI detected\n");
|
||||||
rank = atoi(localRankStr);
|
rank = atoi(localRankStr);
|
||||||
}
|
} else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) {
|
||||||
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
|
printf("MVAPICH detected\n");
|
||||||
{
|
|
||||||
rank = atoi(localRankStr);
|
rank = atoi(localRankStr);
|
||||||
|
} else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) {
|
||||||
|
printf("SLURM detected\n");
|
||||||
|
rank = atoi(localRankStr);
|
||||||
|
} else {
|
||||||
|
printf("MPI version is unknown - bad things may happen\n");
|
||||||
}
|
}
|
||||||
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
|
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
|
|
||||||
size_t totalDeviceMem=0;
|
size_t totalDeviceMem=0;
|
||||||
for (int i = 0; i < nDevices; i++) {
|
for (int i = 0; i < nDevices; i++) {
|
||||||
|
|
||||||
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
|
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit[%d]: " #canMapHostMemory ": " FMT" \n",rank,prop.canMapHostMemory);
|
||||||
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
|
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
|
||||||
cudaGetDeviceProperties(&gpu_props[i], i);
|
cudaGetDeviceProperties(&gpu_props[i], i);
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
prop = gpu_props[i];
|
prop = gpu_props[i];
|
||||||
totalDeviceMem = prop.totalGlobalMem;
|
totalDeviceMem = prop.totalGlobalMem;
|
||||||
if ( world_rank == 0) {
|
if ( world_rank == 0) {
|
||||||
printf("AcceleratorCudaInit: ========================\n");
|
#ifndef GRID_IBM_SUMMIT
|
||||||
printf("AcceleratorCudaInit: Device Number : %d\n", i);
|
if ( i==rank ) {
|
||||||
printf("AcceleratorCudaInit: ========================\n");
|
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
|
||||||
printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name);
|
printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i);
|
||||||
|
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
|
||||||
|
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
|
||||||
|
|
||||||
GPU_PROP_FMT(totalGlobalMem,"%lld");
|
GPU_PROP_FMT(totalGlobalMem,"%lld");
|
||||||
GPU_PROP(managedMemory);
|
GPU_PROP(managedMemory);
|
||||||
GPU_PROP(isMultiGpuBoard);
|
GPU_PROP(isMultiGpuBoard);
|
||||||
GPU_PROP(warpSize);
|
GPU_PROP(warpSize);
|
||||||
|
GPU_PROP(pciBusID);
|
||||||
|
GPU_PROP(pciDeviceID);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// GPU_PROP(unifiedAddressing);
|
// GPU_PROP(unifiedAddressing);
|
||||||
// GPU_PROP(l2CacheSize);
|
// GPU_PROP(l2CacheSize);
|
||||||
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
||||||
@ -61,9 +74,9 @@ void acceleratorInit(void)
|
|||||||
|
|
||||||
#ifdef GRID_IBM_SUMMIT
|
#ifdef GRID_IBM_SUMMIT
|
||||||
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
||||||
if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n");
|
if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n");
|
||||||
#else
|
#else
|
||||||
if ( world_rank == 0 ) printf("AcceleratorCudaInit: setting device to node rank\n");
|
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
|
||||||
cudaSetDevice(rank);
|
cudaSetDevice(rank);
|
||||||
#endif
|
#endif
|
||||||
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
|
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
|
||||||
|
Loading…
x
Reference in New Issue
Block a user