Merge branch 'develop' into feature/distil

* develop: Change to reporting NVCC timer support Fix no-compile under NVCC --enable-summit flag IBM summit optimisation. Synchronise in node is still between 2 halves of AC922, so could be a little faster Sliced propagator contraction was not producing any results because buf.size()=0 several typos in hadrons
2026-05-30 05:54:17 +01:00 · 2019-11-30 16:47:03 +00:00
parent 2db814f2b7 62b3799c77
commit 7983ff2fdd
9 changed files with 57 additions and 32 deletions
@@ -162,11 +162,8 @@ static inline int divides(int a,int b)
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
+  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
-  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
-
  int ndimension = WorldDims.size();
  ShmDims=Coordinate(ndimension,1);

@@ -177,7 +174,8 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
  while(AutoShmSize != WorldShmSize) {
    for(int p=0;p<primes.size();p++) {
      int prime=primes[p];
-      if ( divides(prime,WorldDims[dim]/ShmDims[dim]) ) {
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
+        && divides(prime,WorldShmSize/AutoShmSize)  ) {
 	AutoShmSize*=prime;
 	ShmDims[dim]*=prime;
 	break;
@@ -308,7 +306,6 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
-
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
@@ -435,10 +432,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // e.g. DGX1, supermicro board, 
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
+
 #ifdef GRID_IBM_SUMMIT
-  std::cout << header << "flag IBM_SUMMIT disabled CUDA set device: ensure jsrun is used correctly" <<std::endl;
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+    std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
 #else
-  cudaSetDevice(WorldShmRank);
+    std::cout << "setting device to WorldShmRank"<<std::endl;
+    cudaSetDevice(WorldShmRank);
 #endif
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
@@ -466,7 +466,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
    cudaIpcMemHandle_t handle;
-
+    
    if ( r==WorldShmRank ) { 
      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
@@ -735,6 +735,24 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 

+#ifdef GRID_IBM_SUMMIT
+  // Hide the shared memory path between sockets 
+  // if ShmSize is even (so it splits into two equal halves)
+  if ( (ShmSize & 0x1)==0 ) {
+    int SocketSize = ShmSize/2;
+    int mySocket = ShmRank/SocketSize; 
+    for(int r=0;r<size;r++){
+      int hisRank=ShmRanks[r];
+      if ( hisRank!= MPI_UNDEFINED ) {
+	int hisSocket=hisRank/SocketSize;
+	if ( hisSocket != mySocket ) {
+	  ShmRanks[r] = MPI_UNDEFINED;
+	}
+      }
+    }
+  }
+#endif
+
  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
@@ -44,8 +44,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
+#ifdef GRID_NVCC
+accelerator_inline uint64_t __rdtsc(void) {  return 0; }
+accelerator_inline uint64_t __rdpmc(int ) {  return 0; }
+#else
 #include <x86intrin.h>
 #endif
+#endif

 NAMESPACE_BEGIN(Grid);

@@ -89,13 +94,8 @@ inline uint64_t cyclecount(void){
  return tmp;
 }
 #elif defined __x86_64__
-#ifdef GRID_NVCC
-accelerator_inline uint64_t __rdtsc(void) {  return 0; }
-#endif
 inline uint64_t cyclecount(void){ 
  return __rdtsc();
-  //  unsigned int dummy;
-  // return __rdtscp(&dummy);
 }
 #else

@@ -42,8 +42,8 @@ public:
  typedef typename FImpl::SitePropagator pobj;
  typedef typename ComplexField::vector_object vobj;
  
-  static constexpr int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-  static constexpr Complex epsilon_sgn[6]= {1,1,1,-1,-1,-1};
+  static const int epsilon[6][3] ;
+  static const Complex epsilon_sgn[6];

  private: 
  template <class mobj, class robj>
@@ -83,10 +83,15 @@ public:
 				 robj &result);
 };

-template <class FImpl>
-constexpr int BaryonUtils<FImpl>::epsilon[6][3];
-template <class FImpl>
-constexpr Complex BaryonUtils<FImpl>::epsilon_sgn[6];
+template <class FImpl> 
+const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
+template <class FImpl> 
+const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
+						    Complex(1),
+						    Complex(1),
+						    Complex(-1),
+						    Complex(-1),
+						    Complex(-1)};

 template <class FImpl>
 template <class mobj, class robj>
@@ -1233,7 +1233,7 @@ public:
  };
  
  void Report(void) {
-#define AVERAGE(A) _grid->GlobalSum(A);A/=NP;
+#define AVERAGE(A) 
 #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
    RealD NP = _grid->_Nprocessors;
    RealD NN = _grid->NodeCount();
@@ -1281,11 +1281,13 @@ public:
 	std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
 	std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
      }
+      /*
      PRINTIT(mpi3synctime);
      PRINTIT(mpi3synctime_g);
      PRINTIT(shmmergetime);
      PRINTIT(splicetime);
      PRINTIT(nosplicetime);
+      */
    }
 #undef PRINTIT
 #undef AVERAGE
@@ -272,7 +272,7 @@ struct Correlator: Serializable
 {
    GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
                                    Metadata,             info,
-                                    std::vector<Complex>, corr);
+                                    std::vector<Scalar>, corr);
 };

 END_HADRONS_NAMESPACE
@@ -199,7 +199,7 @@ void TMeson<FImpl1, FImpl2>::execute(void)
            Gamma gSnk(gammaList[i].first);
            Gamma gSrc(gammaList[i].second);
            
-            for (unsigned int t = 0; t < buf.size(); ++t)
+            for (unsigned int t = 0; t < nt; ++t)
            {
                result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
            }
@@ -144,7 +144,7 @@ void TWeakEye3pt<FImpl>::execute(void)
 {
    LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl;
    LOG(Message) << "gIn : " << par().gammaIn << std::endl;
-    LOG(Message) << "gOut: " << par().gammaIn << std::endl;
+    LOG(Message) << "gOut: " << par().gammaOut << std::endl;
    LOG(Message) << "tOut: " << par().tOut << std::endl;
    LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
    LOG(Message) << "qbr : " << par().qBarRight << std::endl;
@@ -144,7 +144,7 @@ void TWeakNonEye3pt<FImpl>::execute(void)
 {
    LOG(Message) << "Computing mesonic weak 3pt contractions, non-eye topologies" << std::endl;
    LOG(Message) << "gIn : " << par().gammaIn << std::endl;
-    LOG(Message) << "gOut: " << par().gammaIn << std::endl;
+    LOG(Message) << "gOut: " << par().gammaOut << std::endl;
    LOG(Message) << "ql  : " << par().qLeft << std::endl;
    LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
    LOG(Message) << "qr  : " << par().qRight << std::endl;
@@ -136,15 +136,15 @@ case ${ac_SFW_FP16} in
 esac

 ############### SUMMIT JSRUN
-AC_ARG_ENABLE([jsrun],
-    [AC_HELP_STRING([--enable-jsrun=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
-    [ac_JSRUN=${enable_jsrun}], [ac_JSRUN=no])
-case ${ac_JSRUN} in
+AC_ARG_ENABLE([summit],
+    [AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
+    [ac_SUMMIT=${enable_summit}], [ac_SUMMIT=no])
+case ${ac_SUMMIT} in
+    no);;
    yes)
      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
-    no);;
    *)
-      AC_MSG_ERROR(["JSRUN option not supported ${ac_JSRUN}"]);;
+      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
 esac

 ############### Intel libraries
@@ -255,7 +255,7 @@ AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],

 AC_ARG_ENABLE([gen-simd-width],
            [AS_HELP_STRING([--enable-gen-simd-width=size],
-            [size (in bytes) of the generic SIMD vectors (default: 32)])],
+            [size (in bytes) of the generic SIMD vectors (default: 64)])],
            [ac_gen_simd_width=$enable_gen_simd_width],
            [ac_gen_simd_width=64])