mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-14 01:35:36 +00:00
Summit jsrun GPU mapping updates. Configure with --enable-jsrun
This commit is contained in:
parent
f31e3278a6
commit
ec8e060ec7
@ -41,9 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <sys/shm.h>
|
#include <sys/shm.h>
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
#include <zlib.h>
|
#include <zlib.h>
|
||||||
#ifdef HAVE_NUMAIF_H
|
|
||||||
#include <numaif.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -99,6 +96,7 @@ public:
|
|||||||
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
|
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
|
||||||
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
|
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
|
||||||
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
|
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
|
||||||
|
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
// Provide shared memory facilities off comm world
|
// Provide shared memory facilities off comm world
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
|
@ -155,6 +155,37 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
|
|||||||
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
|
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
|
||||||
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
|
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
|
||||||
}
|
}
|
||||||
|
static inline int divides(int a,int b)
|
||||||
|
{
|
||||||
|
return ( b == ( (b/a)*a ) );
|
||||||
|
}
|
||||||
|
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
|
||||||
|
{
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Assert power of two shm_size.
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
|
||||||
|
assert(log2size != -1);
|
||||||
|
|
||||||
|
int ndimension = WorldDims.size();
|
||||||
|
ShmDims=Coordinate(ndimension,1);
|
||||||
|
|
||||||
|
std::vector<int> primes({2,3,5});
|
||||||
|
|
||||||
|
int dim = 0;
|
||||||
|
int AutoShmSize = 1;
|
||||||
|
while(AutoShmSize != WorldShmSize) {
|
||||||
|
for(int p=0;p<primes.size();p++) {
|
||||||
|
int prime=primes[p];
|
||||||
|
if ( divides(prime,WorldDims[dim]/ShmDims[dim]) ) {
|
||||||
|
AutoShmSize*=prime;
|
||||||
|
ShmDims[dim]*=prime;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dim=(dim+1) %ndimension;
|
||||||
|
}
|
||||||
|
}
|
||||||
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
|
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
|
||||||
{
|
{
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
@ -221,17 +252,13 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
|
|||||||
// in a maximally symmetrical way
|
// in a maximally symmetrical way
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
int ndimension = processors.size();
|
int ndimension = processors.size();
|
||||||
std::vector<int> processor_coor(ndimension);
|
Coordinate processor_coor(ndimension);
|
||||||
std::vector<int> WorldDims = processors.toVector();
|
Coordinate WorldDims = processors;
|
||||||
std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
|
Coordinate ShmDims (ndimension); Coordinate NodeDims (ndimension);
|
||||||
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
|
Coordinate ShmCoor (ndimension); Coordinate NodeCoor (ndimension); Coordinate WorldCoor(ndimension);
|
||||||
std::vector<int> HyperCoor(ndimension);
|
Coordinate HyperCoor(ndimension);
|
||||||
int dim = 0;
|
|
||||||
for(int l2=0;l2<log2size;l2++){
|
GetShmDims(WorldDims,ShmDims);
|
||||||
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
|
|
||||||
ShmDims[dim]*=2;
|
|
||||||
dim=(dim+1)%ndimension;
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Establish torus of processes and nodes with sub-blockings
|
// Establish torus of processes and nodes with sub-blockings
|
||||||
@ -281,11 +308,6 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
|
|||||||
}
|
}
|
||||||
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
|
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
|
||||||
{
|
{
|
||||||
////////////////////////////////////////////////////////////////
|
|
||||||
// Assert power of two shm_size.
|
|
||||||
////////////////////////////////////////////////////////////////
|
|
||||||
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
|
|
||||||
assert(log2size != -1);
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Identify subblock of ranks on node spreading across dims
|
// Identify subblock of ranks on node spreading across dims
|
||||||
@ -293,15 +315,10 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
|
|||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
int ndimension = processors.size();
|
int ndimension = processors.size();
|
||||||
Coordinate processor_coor(ndimension);
|
Coordinate processor_coor(ndimension);
|
||||||
Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1); Coordinate NodeDims (ndimension);
|
Coordinate WorldDims = processors; Coordinate ShmDims(ndimension); Coordinate NodeDims (ndimension);
|
||||||
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
|
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
|
||||||
int dim = 0;
|
|
||||||
for(int l2=0;l2<log2size;l2++){
|
|
||||||
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
|
|
||||||
ShmDims[dim]*=2;
|
|
||||||
dim=(dim+1)%ndimension;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
GetShmDims(WorldDims,ShmDims);
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Establish torus of processes and nodes with sub-blockings
|
// Establish torus of processes and nodes with sub-blockings
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
@ -418,7 +435,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// e.g. DGX1, supermicro board,
|
// e.g. DGX1, supermicro board,
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
||||||
|
#ifdef GRID_IBM_SUMMIT
|
||||||
|
std::cout << header << "flag IBM_SUMMIT disabled CUDA set device: ensure jsrun is used correctly" <<std::endl;
|
||||||
|
#else
|
||||||
cudaSetDevice(WorldShmRank);
|
cudaSetDevice(WorldShmRank);
|
||||||
|
#endif
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -292,7 +292,7 @@ void GridGpuInit(void)
|
|||||||
gpu_props = new cudaDeviceProp[nDevices];
|
gpu_props = new cudaDeviceProp[nDevices];
|
||||||
|
|
||||||
char * localRankStr = NULL;
|
char * localRankStr = NULL;
|
||||||
int rank = 0, device = 0, world_rank=0;
|
int rank = 0, world_rank=0;
|
||||||
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
|
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
|
||||||
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
|
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
|
||||||
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
|
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
|
||||||
@ -301,23 +301,16 @@ void GridGpuInit(void)
|
|||||||
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
|
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
|
||||||
{
|
{
|
||||||
rank = atoi(localRankStr);
|
rank = atoi(localRankStr);
|
||||||
device = rank %nDevices;
|
|
||||||
}
|
}
|
||||||
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
|
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
|
||||||
{
|
{
|
||||||
rank = atoi(localRankStr);
|
rank = atoi(localRankStr);
|
||||||
device = rank %nDevices;
|
|
||||||
}
|
}
|
||||||
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
|
|
||||||
cudaSetDevice(device);
|
|
||||||
if ( world_rank == 0 ) {
|
if ( world_rank == 0 ) {
|
||||||
GridBanner();
|
GridBanner();
|
||||||
printf("GpuInit: ================================================\n");
|
|
||||||
printf("GpuInit: Setting up Cuda Device map before first MPI call\n",nDevices);
|
|
||||||
printf("GpuInit: ================================================\n");
|
|
||||||
printf("GpuInit: Cuda reports %d GPUs on MPI rank 0\n",nDevices);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < nDevices; i++) {
|
for (int i = 0; i < nDevices; i++) {
|
||||||
@ -325,7 +318,6 @@ void GridGpuInit(void)
|
|||||||
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
|
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
|
||||||
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
|
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
|
||||||
|
|
||||||
// cudaGetDeviceProperties(&prop, i);
|
|
||||||
cudaGetDeviceProperties(&gpu_props[i], i);
|
cudaGetDeviceProperties(&gpu_props[i], i);
|
||||||
if ( world_rank == 0) {
|
if ( world_rank == 0) {
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
@ -334,15 +326,13 @@ void GridGpuInit(void)
|
|||||||
printf("GpuInit: Device Number : %d\n", i);
|
printf("GpuInit: Device Number : %d\n", i);
|
||||||
printf("GpuInit: ========================\n");
|
printf("GpuInit: ========================\n");
|
||||||
printf("GpuInit: Device identifier: %s\n", prop.name);
|
printf("GpuInit: Device identifier: %s\n", prop.name);
|
||||||
// printf("GpuInit: Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
|
|
||||||
GPU_PROP(managedMemory);
|
GPU_PROP(managedMemory);
|
||||||
GPU_PROP(isMultiGpuBoard);
|
GPU_PROP(isMultiGpuBoard);
|
||||||
GPU_PROP(warpSize);
|
GPU_PROP(warpSize);
|
||||||
#if 0
|
// GPU_PROP(unifiedAddressing);
|
||||||
GPU_PROP(unifiedAddressing);
|
// GPU_PROP(l2CacheSize);
|
||||||
GPU_PROP(l2CacheSize);
|
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
||||||
GPU_PROP(singleToDoublePrecisionPerfRatio);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ( world_rank == 0 ) {
|
if ( world_rank == 0 ) {
|
||||||
|
32
configure.ac
32
configure.ac
@ -67,7 +67,6 @@ AC_CHECK_HEADERS(malloc/malloc.h)
|
|||||||
AC_CHECK_HEADERS(malloc.h)
|
AC_CHECK_HEADERS(malloc.h)
|
||||||
AC_CHECK_HEADERS(endian.h)
|
AC_CHECK_HEADERS(endian.h)
|
||||||
AC_CHECK_HEADERS(execinfo.h)
|
AC_CHECK_HEADERS(execinfo.h)
|
||||||
AC_CHECK_HEADERS(numaif.h)
|
|
||||||
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
||||||
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
||||||
|
|
||||||
@ -136,6 +135,18 @@ case ${ac_SFW_FP16} in
|
|||||||
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
|
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
############### SUMMIT JSRUN
|
||||||
|
AC_ARG_ENABLE([jsrun],
|
||||||
|
[AC_HELP_STRING([--enable-jsrun=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
|
||||||
|
[ac_JSRUN=${enable_jsrun}], [ac_JSRUN=no])
|
||||||
|
case ${ac_JSRUN} in
|
||||||
|
yes)
|
||||||
|
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
|
||||||
|
no);;
|
||||||
|
*)
|
||||||
|
AC_MSG_ERROR(["JSRUN option not supported ${ac_JSRUN}"]);;
|
||||||
|
esac
|
||||||
|
|
||||||
############### Intel libraries
|
############### Intel libraries
|
||||||
AC_ARG_ENABLE([mkl],
|
AC_ARG_ENABLE([mkl],
|
||||||
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
|
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
|
||||||
@ -173,19 +184,6 @@ AC_ARG_WITH([hdf5],
|
|||||||
[AM_CXXFLAGS="-I$with_hdf5/include $AM_CXXFLAGS"]
|
[AM_CXXFLAGS="-I$with_hdf5/include $AM_CXXFLAGS"]
|
||||||
[AM_LDFLAGS="-L$with_hdf5/lib $AM_LDFLAGS"])
|
[AM_LDFLAGS="-L$with_hdf5/lib $AM_LDFLAGS"])
|
||||||
|
|
||||||
############### first-touch
|
|
||||||
AC_ARG_ENABLE([numa],
|
|
||||||
[AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
|
|
||||||
[ac_NUMA=${enable_NUMA}],[ac_NUMA=no])
|
|
||||||
|
|
||||||
case ${ac_NUMA} in
|
|
||||||
no)
|
|
||||||
;;
|
|
||||||
yes)
|
|
||||||
AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
|
|
||||||
*)
|
|
||||||
AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
|
|
||||||
esac
|
|
||||||
|
|
||||||
############### Checks for library functions
|
############### Checks for library functions
|
||||||
CXXFLAGS_CPY=$CXXFLAGS
|
CXXFLAGS_CPY=$CXXFLAGS
|
||||||
@ -241,10 +239,6 @@ AC_SEARCH_LIBS([crc32], [z],
|
|||||||
[have_zlib=true] [LIBS="${LIBS} -lz"],
|
[have_zlib=true] [LIBS="${LIBS} -lz"],
|
||||||
[AC_MSG_ERROR(zlib library was not found in your system.)])
|
[AC_MSG_ERROR(zlib library was not found in your system.)])
|
||||||
|
|
||||||
AC_SEARCH_LIBS([move_pages], [numa],
|
|
||||||
[AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
|
|
||||||
[have_libnuma=true] [LIBS="${LIBS} -lnuma"],
|
|
||||||
[AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
|
|
||||||
|
|
||||||
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
|
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
|
||||||
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
|
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
|
||||||
@ -263,7 +257,7 @@ AC_ARG_ENABLE([gen-simd-width],
|
|||||||
[AS_HELP_STRING([--enable-gen-simd-width=size],
|
[AS_HELP_STRING([--enable-gen-simd-width=size],
|
||||||
[size (in bytes) of the generic SIMD vectors (default: 32)])],
|
[size (in bytes) of the generic SIMD vectors (default: 32)])],
|
||||||
[ac_gen_simd_width=$enable_gen_simd_width],
|
[ac_gen_simd_width=$enable_gen_simd_width],
|
||||||
[ac_gen_simd_width=32])
|
[ac_gen_simd_width=64])
|
||||||
|
|
||||||
AC_ARG_ENABLE([gen-scalar],
|
AC_ARG_ENABLE([gen-scalar],
|
||||||
[AS_HELP_STRING([--enable-gen-scalar=yes|no],
|
[AS_HELP_STRING([--enable-gen-scalar=yes|no],
|
||||||
|
Loading…
Reference in New Issue
Block a user