Memory verbose and tracking, shrink default cache

Print PCI device IDs on node 0
2025-12-23 22:24:30 +00:00 · 2021-10-05 11:41:03 -04:00
parent cffc736bb3
commit 8ed0b57b09
5 changed files with 98 additions and 25 deletions
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -74,11 +74,13 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
+
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP

 #ifdef GRID_DEFAULT_GPU
+  int device = 0;
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
@@ -87,10 +89,20 @@ void acceleratorInit(void)
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
 #else
+  int device = rank;
  printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
  printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
-  cudaSetDevice(rank);
 #endif
+
+  cudaSetDevice(device);
+
+  const int len=64;
+  char busid[len];
+  if( rank == world_rank ) { 
+    cudaDeviceGetPCIBusId(busid, len, device);
+    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
+  }
+
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -115,6 +115,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific

+inline void cuda_mem(void) // Debug aid: print used/free/total device memory (bytes, as reported by cudaMemGetInfo) to std::cout.
+{
+  size_t free_t=0,total_t=0; // zero-initialised so a failed query can never expose indeterminate values
+  cudaError_t err = cudaMemGetInfo(&free_t,&total_t); // status was previously ignored
+  if ( err != cudaSuccess ) { std::cout << " MemoryManager : cudaMemGetInfo failed: "<<cudaGetErrorString(err)<<std::endl; return; }
+  std::cout << " MemoryManager : GPU used "<<(total_t-free_t)<<" free "<<free_t<< " total "<<total_t<<std::endl;
+}
+
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
@@ -125,7 +133,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
    };									\
    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
+    std::cout << "========================== CUDA KERNEL CALL\n";	\
+    cuda_mem();								\
    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
+    cuda_mem();								\
+    std::cout << "========================== CUDA KERNEL DONE\n";	\
  }

 #define accelerator_for6dNB(iter1, num1,				\