Mirror of https://github.com/paboyle/Grid.git
Summit jsrun GPU mapping updates. Configure with --enable-jsrun
@@ -41,9 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <sys/shm.h>
 #include <sys/mman.h>
 #include <zlib.h>
-#ifdef HAVE_NUMAIF_H
-#include <numaif.h>
-#endif
 
 NAMESPACE_BEGIN(Grid);
 
@@ -99,6 +96,7 @@ public:
   static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
   static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
   static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
   ///////////////////////////////////////////////////
   // Provide shared memory facilities off comm world
   ///////////////////////////////////////////////////
@@ -155,6 +155,37 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
   if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
   else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
+static inline int divides(int a,int b)
+{
+  return ( b == ( (b/a)*a ) );
+}
+void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
+{
+  ////////////////////////////////////////////////////////////////
+  // Assert power of two shm_size.
+  ////////////////////////////////////////////////////////////////
+  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
+  assert(log2size != -1);
+
+  int ndimension = WorldDims.size();
+  ShmDims=Coordinate(ndimension,1);
+
+  std::vector<int> primes({2,3,5});
+
+  int dim = 0;
+  int AutoShmSize = 1;
+  while(AutoShmSize != WorldShmSize) {
+    for(int p=0;p<primes.size();p++) {
+      int prime=primes[p];
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim]) ) {
+	AutoShmSize*=prime;
+	ShmDims[dim]*=prime;
+	break;
+      }
+    }
+    dim=(dim+1) %ndimension;
+  }
+}
 void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
   ////////////////////////////////////////////////////////////////
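The new GetShmDims, used by both communicator builders below, replaces a doubling loop that could only split a power-of-two node size across the grid: it walks the dimensions round-robin, peeling off prime factors 2, 3 and 5 while the current dimension can still absorb them. Note that the committed body still asserts a power-of-two WorldShmSize up front, so the 3 and 5 branches stay dormant until that assert is relaxed. A minimal standalone sketch of the factorisation follows, under stated assumptions: std::vector<int> stands in for Grid's Coordinate, WorldShmSize arrives as a parameter rather than a class static, and the assert is dropped so the odd-prime path can be exercised.

#include <iostream>
#include <vector>

static inline int divides(int a,int b) { return ( b == ( (b/a)*a ) ); }

// Sketch of GetShmDims: spread WorldShmSize ranks-per-node over the process
// grid by peeling prime factors {2,3,5} round-robin across dimensions.
// Assumes WorldShmSize factorises into the grid dimensions, else the loop
// never exits (the committed code guards this with asserts).
void GetShmDimsSketch(const std::vector<int> &WorldDims,
                      std::vector<int> &ShmDims, int WorldShmSize)
{
  int ndimension = WorldDims.size();
  ShmDims.assign(ndimension,1);
  std::vector<int> primes({2,3,5});
  int dim = 0;
  int AutoShmSize = 1;
  while(AutoShmSize != WorldShmSize) {
    for(int p=0;p<(int)primes.size();p++) {
      int prime=primes[p];
      // take a factor only if this dimension's remaining extent divides by it
      if ( divides(prime,WorldDims[dim]/ShmDims[dim]) ) {
        AutoShmSize*=prime;
        ShmDims[dim]*=prime;
        break;
      }
    }
    dim=(dim+1)%ndimension;
  }
}

int main(void)
{
  // 6 ranks per node on a 2x3x4x4 process grid -> prints "2 3 1 1"
  std::vector<int> WorldDims({2,3,4,4}), ShmDims;
  GetShmDimsSketch(WorldDims,ShmDims,6);
  for(int d : ShmDims) std::cout << d << " ";
  std::cout << std::endl;
  return 0;
}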
@@ -221,17 +252,13 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   // in a maximally symmetrical way
   ////////////////////////////////////////////////////////////////
   int ndimension              = processors.size();
-  std::vector<int> processor_coor(ndimension);
-  std::vector<int> WorldDims = processors.toVector();
-  std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
-  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
-  std::vector<int> HyperCoor(ndimension);
-  int dim = 0;
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%ndimension;
-  }
+  Coordinate processor_coor(ndimension);
+  Coordinate WorldDims = processors;
+  Coordinate ShmDims  (ndimension);  Coordinate NodeDims (ndimension);
+  Coordinate ShmCoor  (ndimension);    Coordinate NodeCoor (ndimension);    Coordinate WorldCoor(ndimension);
+  Coordinate HyperCoor(ndimension);
+  GetShmDims(WorldDims,ShmDims);
 
   ////////////////////////////////////////////////////////////////
   // Establish torus of processes and nodes with sub-blockings
@@ -281,11 +308,6 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
-  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
-  ////////////////////////////////////////////////////////////////
-  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
 
   ////////////////////////////////////////////////////////////////
   // Identify subblock of ranks on node spreading across dims
@@ -293,15 +315,10 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
   ////////////////////////////////////////////////////////////////
   int ndimension              = processors.size();
   Coordinate processor_coor(ndimension);
-  Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1);  Coordinate NodeDims (ndimension);
+  Coordinate WorldDims = processors; Coordinate ShmDims(ndimension);  Coordinate NodeDims (ndimension);
   Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);
-  int dim = 0;
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%ndimension;
-  }
-
+  GetShmDims(WorldDims,ShmDims);
   ////////////////////////////////////////////////////////////////
   // Establish torus of processes and nodes with sub-blockings
   ////////////////////////////////////////////////////////////////
@@ -418,7 +435,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   // e.g. DGX1, supermicro board, 
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
+#ifdef GRID_IBM_SUMMIT
+  std::cout << header << "flag IBM_SUMMIT disabled CUDA set device: ensure jsrun is used correctly" <<std::endl;
+#else
   cudaSetDevice(WorldShmRank);
+#endif
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
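Why a guard rather than a different device index: when jsrun is used with one GPU per resource set on Summit, each rank only sees its assigned device, so the default device is already correct, and calling cudaSetDevice(WorldShmRank) with the node-local rank would fight the resource manager's binding. A small diagnostic along those lines, assuming only the CUDA runtime API (compile with nvcc and launch one copy per rank); the environment variable name is what OpenMPI-derived MPIs such as Spectrum MPI set:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Print what each rank can see before any cudaSetDevice() call. Under a
// correctly configured jsrun (one GPU per resource set) every rank should
// report one visible device, with device 0 as the default.
int main(void)
{
  int nDevices = 0, current = -1;
  cudaGetDeviceCount(&nDevices);
  cudaGetDevice(&current);   // default device; no cudaSetDevice() issued
  const char *lr = getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI/Spectrum MPI
  printf("local rank %s: %d visible device(s), default device %d\n",
         lr ? lr : "?", nDevices, current);
  return 0;
}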
@@ -292,7 +292,7 @@ void GridGpuInit(void)
   gpu_props = new cudaDeviceProp[nDevices];
 
   char * localRankStr = NULL;
-  int rank = 0, device = 0, world_rank=0; 
+  int rank = 0, world_rank=0; 
 #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
@@ -301,23 +301,16 @@ void GridGpuInit(void)
   if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
   {
     rank = atoi(localRankStr);
-    device = rank %nDevices;
   }
   if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
   {
     rank = atoi(localRankStr);
-    device = rank %nDevices;
   }
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
 
-  cudaSetDevice(device);
   if ( world_rank == 0 ) {
     GridBanner();
-    printf("GpuInit: ================================================\n");
-    printf("GpuInit: Setting up Cuda Device map before first MPI call\n",nDevices);
-    printf("GpuInit: ================================================\n");
-    printf("GpuInit: Cuda reports %d GPUs on MPI rank 0\n",nDevices);
   }
 
   for (int i = 0; i < nDevices; i++) {
@@ -325,7 +318,6 @@ void GridGpuInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("GpuInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
     
-    //      cudaGetDeviceProperties(&prop, i);
     cudaGetDeviceProperties(&gpu_props[i], i);
     if ( world_rank == 0) {
       cudaDeviceProp prop; 
@@ -334,15 +326,13 @@ void GridGpuInit(void)
       printf("GpuInit: Device Number    : %d\n", i);
       printf("GpuInit: ========================\n");
       printf("GpuInit: Device identifier: %s\n", prop.name);
-      //      printf("GpuInit:   Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
       GPU_PROP(managedMemory);
       GPU_PROP(isMultiGpuBoard);
       GPU_PROP(warpSize);
-#if 0
-      GPU_PROP(unifiedAddressing);
-      GPU_PROP(l2CacheSize);
-      GPU_PROP(singleToDoublePrecisionPerfRatio);
-#endif
+      //      GPU_PROP(unifiedAddressing);
+      //      GPU_PROP(l2CacheSize);
+      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
     }
   }
   if ( world_rank == 0 ) {
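Net effect of the GridGpuInit hunks: the early cudaSetDevice(rank % nDevices) and its banner output are gone, so GridGpuInit now only enumerates device properties (printing on world rank 0). Actual device binding is deferred to SharedMemoryAllocate above, where it is either done by node-local rank or, under GRID_IBM_SUMMIT, left entirely to jsrun.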
configure.ac
@@ -67,7 +67,6 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
-AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
 
@@ -136,6 +135,18 @@ case ${ac_SFW_FP16} in
       AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac
 
+############### SUMMIT JSRUN
+AC_ARG_ENABLE([jsrun],
+    [AC_HELP_STRING([--enable-jsrun=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
+    [ac_JSRUN=${enable_jsrun}], [ac_JSRUN=no])
+case ${ac_JSRUN} in
+    yes)
+      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
+    no);;
+    *)
+      AC_MSG_ERROR(["JSRUN option not supported ${ac_JSRUN}"]);;
+esac
+
 ############### Intel libraries
 AC_ARG_ENABLE([mkl],
     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
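As the commit message says, the Summit path is opt-in: configuring with --enable-jsrun defines GRID_IBM_SUMMIT, which in turn disables the cudaSetDevice(WorldShmRank) call in SharedMemoryAllocate and trusts jsrun's GPU-to-rank mapping instead.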
@@ -173,19 +184,6 @@ AC_ARG_WITH([hdf5],
     [AM_CXXFLAGS="-I$with_hdf5/include $AM_CXXFLAGS"]
     [AM_LDFLAGS="-L$with_hdf5/lib $AM_LDFLAGS"])
 
-############### first-touch
-AC_ARG_ENABLE([numa],
-    [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
-    [ac_NUMA=${enable_NUMA}],[ac_NUMA=no])
-
-case ${ac_NUMA} in
-    no)
-        ;;
-    yes)
-        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
-    *)
-        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
-esac
-
 ############### Checks for library functions
 CXXFLAGS_CPY=$CXXFLAGS
@@ -241,10 +239,6 @@ AC_SEARCH_LIBS([crc32], [z],
                [have_zlib=true] [LIBS="${LIBS} -lz"],
 	       [AC_MSG_ERROR(zlib library was not found in your system.)])
 
-AC_SEARCH_LIBS([move_pages], [numa],
-               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
-               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
-	       [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
 
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
@@ -263,7 +257,7 @@ AC_ARG_ENABLE([gen-simd-width],
            [AS_HELP_STRING([--enable-gen-simd-width=size],
            [size (in bytes) of the generic SIMD vectors (default: 32)])],
            [ac_gen_simd_width=$enable_gen_simd_width],
-           [ac_gen_simd_width=32])
+           [ac_gen_simd_width=64])
 
 AC_ARG_ENABLE([gen-scalar],
            [AS_HELP_STRING([--enable-gen-scalar=yes|no],