mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-18 07:47:06 +01:00
Memory verbose and tracking, shrink default cache
Print PCI device IDs on node 0
This commit is contained in:
@ -74,11 +74,13 @@ void acceleratorInit(void)
|
||||
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
||||
}
|
||||
}
|
||||
|
||||
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
|
||||
#undef GPU_PROP_FMT
|
||||
#undef GPU_PROP
|
||||
|
||||
#ifdef GRID_DEFAULT_GPU
|
||||
int device = 0;
|
||||
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorCudaInit: using default device \n");
|
||||
@ -87,10 +89,20 @@ void acceleratorInit(void)
|
||||
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
|
||||
}
|
||||
#else
|
||||
int device = rank;
|
||||
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
|
||||
printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
|
||||
cudaSetDevice(rank);
|
||||
#endif
|
||||
|
||||
cudaSetDevice(device);
|
||||
|
||||
const int len=64;
|
||||
char busid[len];
|
||||
if( rank == world_rank ) {
|
||||
cudaDeviceGetPCIBusId(busid, len, device);
|
||||
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
|
||||
}
|
||||
|
||||
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
|
||||
}
|
||||
#endif
|
||||
|
@ -115,6 +115,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#endif
|
||||
} // CUDA specific
|
||||
|
||||
inline void cuda_mem(void)
|
||||
{
|
||||
size_t free_t,total_t,used_t;
|
||||
cudaMemGetInfo(&free_t,&total_t);
|
||||
used_t=total_t-free_t;
|
||||
std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
|
||||
}
|
||||
|
||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
{ \
|
||||
int nt=acceleratorThreads(); \
|
||||
@ -125,7 +133,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
std::cout << "========================== CUDA KERNEL CALL\n"; \
|
||||
cuda_mem(); \
|
||||
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
|
||||
cuda_mem(); \
|
||||
std::cout << "========================== CUDA KERNEL DONE\n"; \
|
||||
}
|
||||
|
||||
#define accelerator_for6dNB(iter1, num1, \
|
||||
|
Reference in New Issue
Block a user