diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index c1a4d93a..30be510b 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid); #define AccSmall (3) #define Shared (4) #define SharedSmall (5) +#undef GRID_MM_VERBOSE uint64_t total_shared; uint64_t total_device; uint64_t total_host;; void MemoryManager::PrintBytes(void) { - std::cout << " MemoryManager : "<>20)<<" shared Mbytes "<>20)<<" accelerator Mbytes "<>20) <<" cpu Mbytes "<>20) <<" cpu cache Mbytes "<>20) <<" acc cache Mbytes "<>20) <<" shared cache Mbytes "<0); #ifdef GRID_OMP @@ -211,6 +255,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries if ( entries[v].valid ) { ret = entries[v].address; + cacheBytes -= entries[v].bytes; entries[v].valid = 0; entries[v].address = NULL; entries[v].bytes = 0; @@ -219,6 +264,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries entries[v].address=ptr; entries[v].bytes =bytes; entries[v].valid =1; + cacheBytes += bytes; return ret; } @@ -228,13 +274,13 @@ void *MemoryManager::Lookup(size_t bytes,int type) #ifdef ALLOCATION_CACHE bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); int cache = type+small; - return Lookup(bytes,Entries[cache],Ncache[cache]); + return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); #else return NULL; #endif } -void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) +void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) { assert(ncache>0); #ifdef GRID_OMP @@ -243,6 +289,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach for(int e=0;e>>(num1,num2,nsimd,lambda); \ + cuda_mem(); \ + std::cout << "========================== CUDA KERNEL DONE\n"; \ } #define accelerator_for6dNB(iter1, num1, \ diff --git a/systems/Summit/config-command b/systems/Summit/config-command new file mode 100644 index 00000000..b565addc --- /dev/null +++ b/systems/Summit/config-command @@ -0,0 +1,14 @@ +../../configure --enable-comms=mpi \ + --enable-simd=GPU \ + --enable-gen-simd-width=32 \ + --enable-unified=no \ + --enable-shm=nvlink \ + --disable-gparity \ + --enable-setdevice \ + --disable-fermion-reps \ + --enable-accelerator=cuda \ + --prefix /ccs/home/paboyle/prefix \ + CXX=nvcc \ + LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \ + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14" + diff --git a/systems/Summit/dwf16.lsf b/systems/Summit/dwf16.lsf new file mode 100644 index 00000000..16f4b82d --- /dev/null +++ b/systems/Summit/dwf16.lsf @@ -0,0 +1,25 @@ +#!/bin/bash +#BSUB -P LGT104 +#BSUB -W 2:00 +#BSUB -nnodes 4 +#BSUB -J DWF + +export OMP_NUM_THREADS=6 +export PAMI_IBV_ADAPTER_AFFINITY=1 +export PAMI_ENABLE_STRIPING=1 +export OPT="--comms-concurrent --comms-overlap " + +APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 " +jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP + +APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT " +jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP + +APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT " +jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP + + + + + + diff --git a/systems/Summit/dwf4.lsf b/systems/Summit/dwf4.lsf new file mode 100644 index 00000000..fcd80bcb --- /dev/null +++ b/systems/Summit/dwf4.lsf @@ -0,0 +1,25 @@ +#!/bin/bash +#BSUB -P LGT104 +#BSUB -W 2:00 +#BSUB -nnodes 4 +#BSUB -J DWF + +export OMP_NUM_THREADS=6 +export PAMI_IBV_ADAPTER_AFFINITY=1 +export PAMI_ENABLE_STRIPING=1 +export OPT="--comms-concurrent --comms-overlap " +#export GRID_ALLOC_NCACHE_LARGE=1 +export APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 " +jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP + +APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT " +jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP + +APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT " +jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP + + + + + + diff --git a/systems/Summit/sourceme-cuda10.sh b/systems/Summit/sourceme-cuda10.sh new file mode 100644 index 00000000..58217613 --- /dev/null +++ b/systems/Summit/sourceme-cuda10.sh @@ -0,0 +1,8 @@ +export UCX_GDR_COPY_RCACHE=no +export UCX_MEMTYPE_CACHE=n +export UCX_RNDV_SCHEME=put_zcopy +module load gcc/7.5.0 +module load cuda/10.2.89 +#cuda/11.4.0 +export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/:$LD_LIBRARY_PATH +