Merge branch 'develop' of github.com:paboyle/Grid into develop

2025-06-18 07:47:06 +01:00 · 2021-10-05 21:13:01 +01:00
parent d899ee80fc 4016e705fc
commit b5c81a02b6
9 changed files with 170 additions and 25 deletions
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
+#undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
-  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+  std::cout << " MemoryManager : PrintBytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl;
+  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
+  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
+  uint64_t cacheBytes;
+  cacheBytes = CacheBytes[Cpu];
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
+  cacheBytes = CacheBytes[Acc];
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
+  cacheBytes = CacheBytes[Shared];
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
+  
+#ifdef GRID_CUDA
+  cuda_mem();
+#endif
+  
 }

 //////////////////////////////////////////////////////////////////////
@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
-
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
+uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
+  total_device+=bytes;
  void *ptr = (void *) Lookup(bytes,Acc);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocDevice(bytes);
-    total_device+=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"AcceleratorAllocate "<<std::endl;
+  PrintBytes();
+#endif
  return ptr;
 }
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
+  total_device-=bytes;
  void *__freeme = Insert(ptr,bytes,Acc);
  if ( __freeme ) {
    acceleratorFreeDevice(__freeme);
-    total_device-=bytes;
-    //    PrintBytes();
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"AcceleratorFree "<<std::endl;
+  PrintBytes();
+#endif
 }
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
+  total_shared+=bytes;
  void *ptr = (void *) Lookup(bytes,Shared);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
-    total_shared+=bytes;
-    //    std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
-    //    PrintBytes();
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"SharedAllocate "<<std::endl;
+  PrintBytes();
+#endif
  return ptr;
 }
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
+  total_shared-=bytes;
  void *__freeme = Insert(ptr,bytes,Shared);
  if ( __freeme ) {
    acceleratorFreeShared(__freeme);
-    total_shared-=bytes;
-    //    PrintBytes();
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"SharedFree "<<std::endl;
+  PrintBytes();
+#endif
 }
 #ifdef GRID_UVM
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
+  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
-    total_host+=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuAllocate "<<std::endl;
+  PrintBytes();
+#endif
  return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
+  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeShared(__freeme);
-    total_host-=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuFree "<<std::endl;
+  PrintBytes();
+#endif
 }
 #else
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
+  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocCpu(bytes);
-    total_host+=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuAllocate "<<std::endl;
+  PrintBytes();
+#endif
  return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
+  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeCpu(__freeme);
-    total_host-=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuFree "<<std::endl;
+  PrintBytes();
+#endif
 }
 #endif

@ -181,13 +225,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
-  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
+  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
 #endif
 }

-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@ -211,6 +255,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries

  if ( entries[v].valid ) {
    ret = entries[v].address;
+    cacheBytes -= entries[v].bytes;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
@ -219,6 +264,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;
+  cacheBytes += bytes;

  return ret;
 }
@ -228,13 +274,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
-  return Lookup(bytes,Entries[cache],Ncache[cache]);
+  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
 #endif
 }

-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@ -243,6 +289,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
+      cacheBytes -= entries[e].bytes;
      return entries[e].address;
    }
  }
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@ -82,14 +82,15 @@ private:
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
+  static uint64_t CacheBytes[NallocType];

  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Lookup(size_t bytes,int type) ;
-  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
-  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
+  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
+  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;

  static void PrintBytes(void);
 public:
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -3,7 +3,7 @@

 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)


@ -429,6 +429,7 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 }
 void  MemoryManager::Print(void)
 {
+  PrintBytes();
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@ -74,11 +74,13 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
+
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP

 #ifdef GRID_DEFAULT_GPU
+  int device = 0;
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
@ -87,10 +89,20 @@ void acceleratorInit(void)
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
 #else
+  int device = rank;
  printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
  printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
-  cudaSetDevice(rank);
 #endif
+
+  cudaSetDevice(device);
+
+  const int len=64;
+  char busid[len];
+  if( rank == world_rank ) { 
+    cudaDeviceGetPCIBusId(busid, len, device);
+    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
+  }
+
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -115,6 +115,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific

+inline void cuda_mem(void)
+{
+  size_t free_t,total_t,used_t;
+  cudaMemGetInfo(&free_t,&total_t);
+  used_t=total_t-free_t;
+  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
+}
+
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
@ -125,7 +133,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
    };									\
    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
+    std::cout << "========================== CUDA KERNEL CALL\n";	\
+    cuda_mem();								\
    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
+    cuda_mem();								\
+    std::cout << "========================== CUDA KERNEL DONE\n";	\
  }

 #define accelerator_for6dNB(iter1, num1,				\
--- a/systems/Summit/config-command
+++ b/systems/Summit/config-command
@ -0,0 +1,14 @@
+../../configure --enable-comms=mpi \
+	      --enable-simd=GPU \
+	      --enable-gen-simd-width=32 \
+	      --enable-unified=no \
+	       --enable-shm=nvlink \
+	       --disable-gparity \
+	       --enable-setdevice \
+	       --disable-fermion-reps \
+	       --enable-accelerator=cuda \
+	       --prefix /ccs/home/paboyle/prefix \
+	       CXX=nvcc \
+	       LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
+	       CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14"
+
--- a/systems/Summit/dwf16.lsf
+++ b/systems/Summit/dwf16.lsf
@ -0,0 +1,25 @@
+#!/bin/bash
+#BSUB -P LGT104
+#BSUB -W 2:00
+#BSUB -nnodes 4
+#BSUB -J DWF
+
+export OMP_NUM_THREADS=6
+export PAMI_IBV_ADAPTER_AFFINITY=1
+export PAMI_ENABLE_STRIPING=1
+export OPT="--comms-concurrent --comms-overlap "
+
+APP="./benchmarks/Benchmark_comms_host_device  --mpi 2.2.2.3 "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+
+
+
+
+
--- a/systems/Summit/dwf4.lsf
+++ b/systems/Summit/dwf4.lsf
@ -0,0 +1,25 @@
+#!/bin/bash
+#BSUB -P LGT104
+#BSUB -W 2:00
+#BSUB -nnodes 4
+#BSUB -J DWF
+
+export OMP_NUM_THREADS=6
+export PAMI_IBV_ADAPTER_AFFINITY=1
+export PAMI_ENABLE_STRIPING=1
+export OPT="--comms-concurrent --comms-overlap "
+#export GRID_ALLOC_NCACHE_LARGE=1
+export APP="./benchmarks/Benchmark_comms_host_device  --mpi 2.2.2.3 "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+
+
+
+
+
--- a/systems/Summit/sourceme-cuda10.sh
+++ b/systems/Summit/sourceme-cuda10.sh
@ -0,0 +1,8 @@
+export UCX_GDR_COPY_RCACHE=no
+export UCX_MEMTYPE_CACHE=n
+export UCX_RNDV_SCHEME=put_zcopy
+module load gcc/7.5.0
+module load cuda/10.2.89
+#cuda/11.4.0
+export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/:$LD_LIBRARY_PATH 
+