1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Merge branch 'develop' of github.com:paboyle/Grid into develop

This commit is contained in:
Antonin Portelli 2021-10-05 21:13:01 +01:00
commit b5c81a02b6
9 changed files with 170 additions and 25 deletions

View File

@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : PrintBytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
uint64_t cacheBytes;
cacheBytes = CacheBytes[Cpu];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Acc];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Shared];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
#ifdef GRID_CUDA
cuda_mem();
#endif
}
//////////////////////////////////////////////////////////////////////
@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
total_device+=bytes;
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
total_device-=bytes;
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
total_device-=bytes;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorFree "<<std::endl;
PrintBytes();
#endif
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
total_shared+=bytes;
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes;
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
total_shared-=bytes;
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_shared-=bytes;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedFree "<<std::endl;
PrintBytes();
#endif
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_host-=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
total_host+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
total_host-=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#endif
@ -181,13 +225,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
@ -211,6 +255,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
if ( entries[v].valid ) {
ret = entries[v].address;
cacheBytes -= entries[v].bytes;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
@ -219,6 +264,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
cacheBytes += bytes;
return ret;
}
@ -228,13 +274,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
@ -243,6 +289,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
cacheBytes -= entries[e].bytes;
return entries[e].address;
}
}

View File

@ -82,14 +82,15 @@ private:
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
static uint64_t CacheBytes[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
static void PrintBytes(void);
public:

View File

@ -3,7 +3,7 @@
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
@ -429,6 +429,7 @@ void MemoryManager::NotifyDeletion(void *_ptr)
}
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;

View File

@ -74,11 +74,13 @@ void acceleratorInit(void)
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_DEFAULT_GPU
int device = 0;
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) {
printf("AcceleratorCudaInit: using default device \n");
@ -87,10 +89,20 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
}
#else
int device = rank;
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
cudaSetDevice(rank);
#endif
cudaSetDevice(device);
const int len=64;
char busid[len];
if( rank == world_rank ) {
cudaDeviceGetPCIBusId(busid, len, device);
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
}
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
}
#endif

View File

@ -115,6 +115,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#endif
} // CUDA specific
inline void cuda_mem(void)
{
size_t free_t,total_t,used_t;
cudaMemGetInfo(&free_t,&total_t);
used_t=total_t-free_t;
std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
}
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
int nt=acceleratorThreads(); \
@ -125,7 +133,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
std::cout << "========================== CUDA KERNEL CALL\n"; \
cuda_mem(); \
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
cuda_mem(); \
std::cout << "========================== CUDA KERNEL DONE\n"; \
}
#define accelerator_for6dNB(iter1, num1, \

View File

@ -0,0 +1,14 @@
../../configure --enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=32 \
--enable-unified=no \
--enable-shm=nvlink \
--disable-gparity \
--enable-setdevice \
--disable-fermion-reps \
--enable-accelerator=cuda \
--prefix /ccs/home/paboyle/prefix \
CXX=nvcc \
LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14"

25
systems/Summit/dwf16.lsf Normal file
View File

@ -0,0 +1,25 @@
#!/bin/bash
#BSUB -P LGT104
#BSUB -W 2:00
#BSUB -nnodes 4
#BSUB -J DWF
export OMP_NUM_THREADS=6
export PAMI_IBV_ADAPTER_AFFINITY=1
export PAMI_ENABLE_STRIPING=1
export OPT="--comms-concurrent --comms-overlap "
APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP

25
systems/Summit/dwf4.lsf Normal file
View File

@ -0,0 +1,25 @@
#!/bin/bash
#BSUB -P LGT104
#BSUB -W 2:00
#BSUB -nnodes 4
#BSUB -J DWF
export OMP_NUM_THREADS=6
export PAMI_IBV_ADAPTER_AFFINITY=1
export PAMI_ENABLE_STRIPING=1
export OPT="--comms-concurrent --comms-overlap "
#export GRID_ALLOC_NCACHE_LARGE=1
export APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP

View File

@ -0,0 +1,8 @@
export UCX_GDR_COPY_RCACHE=no
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_SCHEME=put_zcopy
module load gcc/7.5.0
module load cuda/10.2.89
#cuda/11.4.0
export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/:$LD_LIBRARY_PATH