Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2026-03-02 02:26:12 +00:00 · 2019-11-21 20:09:31 +00:00
parent feb1ff3494 8ef6175acc
commit f4d27e7090
9 changed files with 102 additions and 76 deletions
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -62,9 +62,12 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
    basis_v[k] = basis[k].View();
  }

+  std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
+
  thread_region
  {
-    std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
+    vobj* B = Bt.data() + Nm * thread_num();
+
    thread_for_in_region(ss, grid->oSites(),{
      for(int j=j0; j<j1; ++j) B[j]=0.;
      
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -41,9 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <sys/shm.h>
 #include <sys/mman.h>
 #include <zlib.h>
-#ifdef HAVE_NUMAIF_H
-#include <numaif.h>
-#endif

 NAMESPACE_BEGIN(Grid);

@@ -99,6 +96,7 @@ public:
  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
  ///////////////////////////////////////////////////
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -155,6 +155,35 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
+static inline int divides(int a,int b) // non-zero iff a divides b exactly; a must be non-zero
+{
+  return ( b == ( (b/a)*a ) );
+}
+// Factor WorldShmSize into per-dimension ShmDims, taking one prime (only 2,3,5 for now) per
+// dimension visit, round-robin; asserts rather than spinning when the sweep can no longer progress.
+void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
+{
+  const int ndimension = WorldDims.size();
+  ShmDims=Coordinate(ndimension,1);
+  const std::vector<int> primes({2,3,5});
+  int dim = 0;
+  int AutoShmSize = 1;
+  int stalled = 0; // consecutive dimensions visited without gaining a factor
+  while(AutoShmSize != WorldShmSize) {
+    assert(stalled < ndimension); // a full sweep found no factor: state cannot change any more
+    stalled++;
+    for(const int prime : primes) {
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
+        && divides(prime,WorldShmSize/AutoShmSize)  ) {
+        AutoShmSize*=prime;
+        ShmDims[dim]*=prime;
+        stalled = 0;
+        break;
+      }
+    }
+    dim=(dim+1) %ndimension;
+  }
+}
 void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
@@ -221,17 +250,13 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
-  std::vector<int> processor_coor(ndimension);
-  std::vector<int> WorldDims = processors.toVector();
-  std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
-  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
-  std::vector<int> HyperCoor(ndimension);
-  int dim = 0;
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%ndimension;
-    }
+  Coordinate processor_coor(ndimension);
+  Coordinate WorldDims = processors;
+  Coordinate ShmDims  (ndimension);  Coordinate NodeDims (ndimension);
+  Coordinate ShmCoor  (ndimension);    Coordinate NodeCoor (ndimension);    Coordinate WorldCoor(ndimension);
+  Coordinate HyperCoor(ndimension);
+
+  GetShmDims(WorldDims,ShmDims);

  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
@@ -281,27 +306,16 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
-  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
-  ////////////////////////////////////////////////////////////////
-  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
-
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
  Coordinate processor_coor(ndimension);
-  Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1);  Coordinate NodeDims (ndimension);
+  Coordinate WorldDims = processors; Coordinate ShmDims(ndimension);  Coordinate NodeDims (ndimension);
  Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);
-  int dim = 0;
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%ndimension;
-  }

+  GetShmDims(WorldDims,ShmDims);
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -418,7 +432,14 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // e.g. DGX1, supermicro board, 
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
-  cudaSetDevice(WorldShmRank);
+
+#ifdef GRID_IBM_SUMMIT
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+    std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
+#else
+    std::cout << "setting device to WorldShmRank"<<std::endl;
+    cudaSetDevice(WorldShmRank);
+#endif
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -445,7 +466,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
    cudaIpcMemHandle_t handle;
-
+    
    if ( r==WorldShmRank ) { 
      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
@@ -714,6 +735,24 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 

+#ifdef GRID_IBM_SUMMIT
+  // Hide the shared memory path between sockets
+  // if the shared memory group has an even number of ranks (ShmSize)
+  if ( (ShmSize & 0x1)==0 ) {
+    int SocketSize = ShmSize/2;
+    int mySocket = ShmRank/SocketSize; 
+    for(int r=0;r<size;r++){
+      int hisRank=ShmRanks[r];
+      if ( hisRank!= MPI_UNDEFINED ) {
+	int hisSocket=hisRank/SocketSize;
+	if ( hisSocket != mySocket ) {
+	  ShmRanks[r] = MPI_UNDEFINED;
+	}
+      }
+    }
+  }
+#endif
+
  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -178,8 +178,8 @@ public:
 private:
  void dealloc(void)
  {
-    alignedAllocator<vobj> alloc;
    if( this->_odata_size ) {
+      alignedAllocator<vobj> alloc;
      alloc.deallocate(this->_odata,this->_odata_size);
      this->_odata=nullptr;
      this->_odata_size=0;
@@ -187,15 +187,17 @@ private:
  }
  void resize(uint64_t size)
  {
-    alignedAllocator<vobj> alloc;
    if ( this->_odata_size != size ) {
+      alignedAllocator<vobj> alloc;
+
      dealloc();
+      
+      this->_odata_size = size;
+      if ( size ) 
+	this->_odata      = alloc.allocate(this->_odata_size);
+      else 
+	this->_odata      = nullptr;
    }
-    this->_odata_size = size;
-    if ( size ) 
-      this->_odata      = alloc.allocate(this->_odata_size);
-    else 
-      this->_odata      = nullptr;
  }
 public:
  /////////////////////////////////////////////////////////////////////////////////
@@ -346,7 +348,7 @@ public:
  void reset(GridBase* grid) {
    if (this->_grid != grid) {
      this->_grid = grid;
-      this->_odata.resize(grid->oSites());
+      this->resize(grid->oSites());
      this->checkerboard = 0;
    }
  }
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -292,7 +292,7 @@ void GridGpuInit(void)
  gpu_props = new cudaDeviceProp[nDevices];

  char * localRankStr = NULL;
-  int rank = 0, device = 0, world_rank=0; 
+  int rank = 0, world_rank=0; 
 #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
@@ -301,23 +301,16 @@ void GridGpuInit(void)
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
  {
    rank = atoi(localRankStr);		
-    device = rank %nDevices;
  }
  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
  {
    rank = atoi(localRankStr);		
-    device = rank %nDevices;
  }
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}

-  cudaSetDevice(device);
  if ( world_rank == 0 ) {
    GridBanner();
-    printf("GpuInit: ================================================\n");
-    printf("GpuInit: Setting up Cuda Device map before first MPI call\n",nDevices);
-    printf("GpuInit: ================================================\n");
-    printf("GpuInit: Cuda reports %d GPUs on MPI rank 0\n",nDevices);
  }

  for (int i = 0; i < nDevices; i++) {
@@ -325,7 +318,6 @@ void GridGpuInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("GpuInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
-    //      cudaGetDeviceProperties(&prop, i);
    cudaGetDeviceProperties(&gpu_props[i], i);
    if ( world_rank == 0) {
      cudaDeviceProp prop; 
@@ -334,15 +326,13 @@ void GridGpuInit(void)
      printf("GpuInit: Device Number    : %d\n", i);
      printf("GpuInit: ========================\n");
      printf("GpuInit: Device identifier: %s\n", prop.name);
-      //      printf("GpuInit:   Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
+
      GPU_PROP(managedMemory);
      GPU_PROP(isMultiGpuBoard);
      GPU_PROP(warpSize);
-#if 0
-      GPU_PROP(unifiedAddressing);
-      GPU_PROP(l2CacheSize);
-      GPU_PROP(singleToDoublePrecisionPerfRatio);
-#endif
+      //      GPU_PROP(unifiedAddressing);
+      //      GPU_PROP(l2CacheSize);
+      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
  if ( world_rank == 0 ) {
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -272,7 +272,7 @@ struct Correlator: Serializable
 {
    GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
                                    Metadata,             info,
-                                    std::vector<Complex>, corr);
+                                    std::vector<Scalar>, corr);
 };

 END_HADRONS_NAMESPACE
--- a/Hadrons/Modules/MContraction/WeakEye3pt.hpp
+++ b/Hadrons/Modules/MContraction/WeakEye3pt.hpp
@@ -144,7 +144,7 @@ void TWeakEye3pt<FImpl>::execute(void)
 {
    LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl;
    LOG(Message) << "gIn : " << par().gammaIn << std::endl;
-    LOG(Message) << "gOut: " << par().gammaIn << std::endl;
+    LOG(Message) << "gOut: " << par().gammaOut << std::endl;
    LOG(Message) << "tOut: " << par().tOut << std::endl;
    LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
    LOG(Message) << "qbr : " << par().qBarRight << std::endl;
--- a/Hadrons/Modules/MContraction/WeakNonEye3pt.hpp
+++ b/Hadrons/Modules/MContraction/WeakNonEye3pt.hpp
@@ -144,7 +144,7 @@ void TWeakNonEye3pt<FImpl>::execute(void)
 {
    LOG(Message) << "Computing mesonic weak 3pt contractions, non-eye topologies" << std::endl;
    LOG(Message) << "gIn : " << par().gammaIn << std::endl;
-    LOG(Message) << "gOut: " << par().gammaIn << std::endl;
+    LOG(Message) << "gOut: " << par().gammaOut << std::endl;
    LOG(Message) << "ql  : " << par().qLeft << std::endl;
    LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
    LOG(Message) << "qr  : " << par().qRight << std::endl;
--- a/configure.ac
+++ b/configure.ac
@@ -67,7 +67,6 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
-AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

@@ -136,6 +135,18 @@ case ${ac_SFW_FP16} in
      AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac

+############### SUMMIT JSRUN: any --enable-summit value other than "no" defines GRID_IBM_SUMMIT
+AC_ARG_ENABLE([summit],
+    [AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
+    [ac_SUMMIT=${enable_summit}], [ac_SUMMIT=no])
+case ${ac_SUMMIT} in
+    no);;
+    yes)
+      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
+    *)
+      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
+esac
+
 ############### Intel libraries
 AC_ARG_ENABLE([mkl],
    [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@@ -173,19 +184,6 @@ AC_ARG_WITH([hdf5],
    [AM_CXXFLAGS="-I$with_hdf5/include $AM_CXXFLAGS"]
    [AM_LDFLAGS="-L$with_hdf5/lib $AM_LDFLAGS"])

-############### first-touch
-AC_ARG_ENABLE([numa],
-    [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
-    [ac_NUMA=${enable_NUMA}],[ac_NUMA=no])
-
-case ${ac_NUMA} in
-    no)
-        ;;
-    yes)
-        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
-    *)
-        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
-esac

 ############### Checks for library functions
 CXXFLAGS_CPY=$CXXFLAGS
@@ -241,10 +239,6 @@ AC_SEARCH_LIBS([crc32], [z],
               [have_zlib=true] [LIBS="${LIBS} -lz"],
 	       [AC_MSG_ERROR(zlib library was not found in your system.)])

-AC_SEARCH_LIBS([move_pages], [numa],
-               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
-               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
-	       [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])

 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
               [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
@@ -261,9 +255,9 @@ AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],

 AC_ARG_ENABLE([gen-simd-width],
            [AS_HELP_STRING([--enable-gen-simd-width=size],
-            [size (in bytes) of the generic SIMD vectors (default: 32)])],
+            [size (in bytes) of the generic SIMD vectors (default: 64)])],
            [ac_gen_simd_width=$enable_gen_simd_width],
-            [ac_gen_simd_width=32])
+            [ac_gen_simd_width=64])

 AC_ARG_ENABLE([gen-scalar],
            [AS_HELP_STRING([--enable-gen-scalar=yes|no],