mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Merge branch 'develop' of github.com:paboyle/Grid into develop
This commit is contained in:
commit
b5c81a02b6
@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
#define AccSmall (3)
|
#define AccSmall (3)
|
||||||
#define Shared (4)
|
#define Shared (4)
|
||||||
#define SharedSmall (5)
|
#define SharedSmall (5)
|
||||||
|
#undef GRID_MM_VERBOSE
|
||||||
uint64_t total_shared;
|
uint64_t total_shared;
|
||||||
uint64_t total_device;
|
uint64_t total_device;
|
||||||
uint64_t total_host;;
|
uint64_t total_host;;
|
||||||
void MemoryManager::PrintBytes(void)
|
void MemoryManager::PrintBytes(void)
|
||||||
{
|
{
|
||||||
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
|
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
|
||||||
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
|
std::cout << " MemoryManager : PrintBytes "<<std::endl;
|
||||||
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
|
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
|
||||||
|
std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl;
|
||||||
|
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
|
||||||
|
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
|
||||||
|
uint64_t cacheBytes;
|
||||||
|
cacheBytes = CacheBytes[Cpu];
|
||||||
|
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
|
||||||
|
cacheBytes = CacheBytes[Acc];
|
||||||
|
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
|
||||||
|
cacheBytes = CacheBytes[Shared];
|
||||||
|
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
|
||||||
|
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
cuda_mem();
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
|
|||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
|
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
|
||||||
int MemoryManager::Victim[MemoryManager::NallocType];
|
int MemoryManager::Victim[MemoryManager::NallocType];
|
||||||
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
|
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
|
||||||
|
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
// Actual allocation and deallocation utils
|
// Actual allocation and deallocation utils
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
void *MemoryManager::AcceleratorAllocate(size_t bytes)
|
void *MemoryManager::AcceleratorAllocate(size_t bytes)
|
||||||
{
|
{
|
||||||
|
total_device+=bytes;
|
||||||
void *ptr = (void *) Lookup(bytes,Acc);
|
void *ptr = (void *) Lookup(bytes,Acc);
|
||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocDevice(bytes);
|
ptr = (void *) acceleratorAllocDevice(bytes);
|
||||||
total_device+=bytes;
|
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_MM_VERBOSE
|
||||||
|
std::cout <<"AcceleratorAllocate "<<std::endl;
|
||||||
|
PrintBytes();
|
||||||
|
#endif
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
|
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
|
||||||
{
|
{
|
||||||
|
total_device-=bytes;
|
||||||
void *__freeme = Insert(ptr,bytes,Acc);
|
void *__freeme = Insert(ptr,bytes,Acc);
|
||||||
if ( __freeme ) {
|
if ( __freeme ) {
|
||||||
acceleratorFreeDevice(__freeme);
|
acceleratorFreeDevice(__freeme);
|
||||||
total_device-=bytes;
|
|
||||||
// PrintBytes();
|
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_MM_VERBOSE
|
||||||
|
std::cout <<"AcceleratorFree "<<std::endl;
|
||||||
|
PrintBytes();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
void *MemoryManager::SharedAllocate(size_t bytes)
|
void *MemoryManager::SharedAllocate(size_t bytes)
|
||||||
{
|
{
|
||||||
|
total_shared+=bytes;
|
||||||
void *ptr = (void *) Lookup(bytes,Shared);
|
void *ptr = (void *) Lookup(bytes,Shared);
|
||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocShared(bytes);
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
total_shared+=bytes;
|
|
||||||
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
|
||||||
// PrintBytes();
|
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_MM_VERBOSE
|
||||||
|
std::cout <<"SharedAllocate "<<std::endl;
|
||||||
|
PrintBytes();
|
||||||
|
#endif
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void MemoryManager::SharedFree (void *ptr,size_t bytes)
|
void MemoryManager::SharedFree (void *ptr,size_t bytes)
|
||||||
{
|
{
|
||||||
|
total_shared-=bytes;
|
||||||
void *__freeme = Insert(ptr,bytes,Shared);
|
void *__freeme = Insert(ptr,bytes,Shared);
|
||||||
if ( __freeme ) {
|
if ( __freeme ) {
|
||||||
acceleratorFreeShared(__freeme);
|
acceleratorFreeShared(__freeme);
|
||||||
total_shared-=bytes;
|
|
||||||
// PrintBytes();
|
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_MM_VERBOSE
|
||||||
|
std::cout <<"SharedFree "<<std::endl;
|
||||||
|
PrintBytes();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#ifdef GRID_UVM
|
#ifdef GRID_UVM
|
||||||
void *MemoryManager::CpuAllocate(size_t bytes)
|
void *MemoryManager::CpuAllocate(size_t bytes)
|
||||||
{
|
{
|
||||||
|
total_host+=bytes;
|
||||||
void *ptr = (void *) Lookup(bytes,Cpu);
|
void *ptr = (void *) Lookup(bytes,Cpu);
|
||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocShared(bytes);
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
total_host+=bytes;
|
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_MM_VERBOSE
|
||||||
|
std::cout <<"CpuAllocate "<<std::endl;
|
||||||
|
PrintBytes();
|
||||||
|
#endif
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
||||||
{
|
{
|
||||||
|
total_host-=bytes;
|
||||||
NotifyDeletion(_ptr);
|
NotifyDeletion(_ptr);
|
||||||
void *__freeme = Insert(_ptr,bytes,Cpu);
|
void *__freeme = Insert(_ptr,bytes,Cpu);
|
||||||
if ( __freeme ) {
|
if ( __freeme ) {
|
||||||
acceleratorFreeShared(__freeme);
|
acceleratorFreeShared(__freeme);
|
||||||
total_host-=bytes;
|
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_MM_VERBOSE
|
||||||
|
std::cout <<"CpuFree "<<std::endl;
|
||||||
|
PrintBytes();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
void *MemoryManager::CpuAllocate(size_t bytes)
|
void *MemoryManager::CpuAllocate(size_t bytes)
|
||||||
{
|
{
|
||||||
|
total_host+=bytes;
|
||||||
void *ptr = (void *) Lookup(bytes,Cpu);
|
void *ptr = (void *) Lookup(bytes,Cpu);
|
||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocCpu(bytes);
|
ptr = (void *) acceleratorAllocCpu(bytes);
|
||||||
total_host+=bytes;
|
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_MM_VERBOSE
|
||||||
|
std::cout <<"CpuAllocate "<<std::endl;
|
||||||
|
PrintBytes();
|
||||||
|
#endif
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
||||||
{
|
{
|
||||||
|
total_host-=bytes;
|
||||||
NotifyDeletion(_ptr);
|
NotifyDeletion(_ptr);
|
||||||
void *__freeme = Insert(_ptr,bytes,Cpu);
|
void *__freeme = Insert(_ptr,bytes,Cpu);
|
||||||
if ( __freeme ) {
|
if ( __freeme ) {
|
||||||
acceleratorFreeCpu(__freeme);
|
acceleratorFreeCpu(__freeme);
|
||||||
total_host-=bytes;
|
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_MM_VERBOSE
|
||||||
|
std::cout <<"CpuFree "<<std::endl;
|
||||||
|
PrintBytes();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -181,13 +225,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
|
|||||||
#ifdef ALLOCATION_CACHE
|
#ifdef ALLOCATION_CACHE
|
||||||
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
|
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
|
||||||
int cache = type + small;
|
int cache = type + small;
|
||||||
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
|
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
|
||||||
#else
|
#else
|
||||||
return ptr;
|
return ptr;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
|
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
|
||||||
{
|
{
|
||||||
assert(ncache>0);
|
assert(ncache>0);
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
@ -211,6 +255,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
|
|||||||
|
|
||||||
if ( entries[v].valid ) {
|
if ( entries[v].valid ) {
|
||||||
ret = entries[v].address;
|
ret = entries[v].address;
|
||||||
|
cacheBytes -= entries[v].bytes;
|
||||||
entries[v].valid = 0;
|
entries[v].valid = 0;
|
||||||
entries[v].address = NULL;
|
entries[v].address = NULL;
|
||||||
entries[v].bytes = 0;
|
entries[v].bytes = 0;
|
||||||
@ -219,6 +264,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
|
|||||||
entries[v].address=ptr;
|
entries[v].address=ptr;
|
||||||
entries[v].bytes =bytes;
|
entries[v].bytes =bytes;
|
||||||
entries[v].valid =1;
|
entries[v].valid =1;
|
||||||
|
cacheBytes += bytes;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -228,13 +274,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
|
|||||||
#ifdef ALLOCATION_CACHE
|
#ifdef ALLOCATION_CACHE
|
||||||
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
|
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
|
||||||
int cache = type+small;
|
int cache = type+small;
|
||||||
return Lookup(bytes,Entries[cache],Ncache[cache]);
|
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
|
||||||
#else
|
#else
|
||||||
return NULL;
|
return NULL;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
|
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
|
||||||
{
|
{
|
||||||
assert(ncache>0);
|
assert(ncache>0);
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
@ -243,6 +289,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
|
|||||||
for(int e=0;e<ncache;e++){
|
for(int e=0;e<ncache;e++){
|
||||||
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
||||||
entries[e].valid = 0;
|
entries[e].valid = 0;
|
||||||
|
cacheBytes -= entries[e].bytes;
|
||||||
return entries[e].address;
|
return entries[e].address;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -82,14 +82,15 @@ private:
|
|||||||
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
|
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
|
||||||
static int Victim[NallocType];
|
static int Victim[NallocType];
|
||||||
static int Ncache[NallocType];
|
static int Ncache[NallocType];
|
||||||
|
static uint64_t CacheBytes[NallocType];
|
||||||
|
|
||||||
/////////////////////////////////////////////////
|
/////////////////////////////////////////////////
|
||||||
// Free pool
|
// Free pool
|
||||||
/////////////////////////////////////////////////
|
/////////////////////////////////////////////////
|
||||||
static void *Insert(void *ptr,size_t bytes,int type) ;
|
static void *Insert(void *ptr,size_t bytes,int type) ;
|
||||||
static void *Lookup(size_t bytes,int type) ;
|
static void *Lookup(size_t bytes,int type) ;
|
||||||
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
|
||||||
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
|
||||||
|
|
||||||
static void PrintBytes(void);
|
static void PrintBytes(void);
|
||||||
public:
|
public:
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
#warning "Using explicit device memory copies"
|
#warning "Using explicit device memory copies"
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
|
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
|
||||||
#define dprintf(...)
|
#define dprintf(...)
|
||||||
|
|
||||||
|
|
||||||
@ -429,6 +429,7 @@ void MemoryManager::NotifyDeletion(void *_ptr)
|
|||||||
}
|
}
|
||||||
void MemoryManager::Print(void)
|
void MemoryManager::Print(void)
|
||||||
{
|
{
|
||||||
|
PrintBytes();
|
||||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||||
std::cout << GridLogDebug << "Memory Manager " << std::endl;
|
std::cout << GridLogDebug << "Memory Manager " << std::endl;
|
||||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||||
|
@ -74,11 +74,13 @@ void acceleratorInit(void)
|
|||||||
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
|
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
|
||||||
#undef GPU_PROP_FMT
|
#undef GPU_PROP_FMT
|
||||||
#undef GPU_PROP
|
#undef GPU_PROP
|
||||||
|
|
||||||
#ifdef GRID_DEFAULT_GPU
|
#ifdef GRID_DEFAULT_GPU
|
||||||
|
int device = 0;
|
||||||
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
||||||
if ( world_rank == 0 ) {
|
if ( world_rank == 0 ) {
|
||||||
printf("AcceleratorCudaInit: using default device \n");
|
printf("AcceleratorCudaInit: using default device \n");
|
||||||
@ -87,10 +89,20 @@ void acceleratorInit(void)
|
|||||||
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
|
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
int device = rank;
|
||||||
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
|
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
|
||||||
printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
|
printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
|
||||||
cudaSetDevice(rank);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
cudaSetDevice(device);
|
||||||
|
|
||||||
|
const int len=64;
|
||||||
|
char busid[len];
|
||||||
|
if( rank == world_rank ) {
|
||||||
|
cudaDeviceGetPCIBusId(busid, len, device);
|
||||||
|
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
|
||||||
|
}
|
||||||
|
|
||||||
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
|
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -115,6 +115,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
|||||||
#endif
|
#endif
|
||||||
} // CUDA specific
|
} // CUDA specific
|
||||||
|
|
||||||
|
inline void cuda_mem(void)
|
||||||
|
{
|
||||||
|
size_t free_t,total_t,used_t;
|
||||||
|
cudaMemGetInfo(&free_t,&total_t);
|
||||||
|
used_t=total_t-free_t;
|
||||||
|
std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||||
{ \
|
{ \
|
||||||
int nt=acceleratorThreads(); \
|
int nt=acceleratorThreads(); \
|
||||||
@ -125,7 +133,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
|||||||
}; \
|
}; \
|
||||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||||
|
std::cout << "========================== CUDA KERNEL CALL\n"; \
|
||||||
|
cuda_mem(); \
|
||||||
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
|
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
|
||||||
|
cuda_mem(); \
|
||||||
|
std::cout << "========================== CUDA KERNEL DONE\n"; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define accelerator_for6dNB(iter1, num1, \
|
#define accelerator_for6dNB(iter1, num1, \
|
||||||
|
14
systems/Summit/config-command
Normal file
14
systems/Summit/config-command
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
../../configure --enable-comms=mpi \
|
||||||
|
--enable-simd=GPU \
|
||||||
|
--enable-gen-simd-width=32 \
|
||||||
|
--enable-unified=no \
|
||||||
|
--enable-shm=nvlink \
|
||||||
|
--disable-gparity \
|
||||||
|
--enable-setdevice \
|
||||||
|
--disable-fermion-reps \
|
||||||
|
--enable-accelerator=cuda \
|
||||||
|
--prefix /ccs/home/paboyle/prefix \
|
||||||
|
CXX=nvcc \
|
||||||
|
LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
|
||||||
|
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14"
|
||||||
|
|
25
systems/Summit/dwf16.lsf
Normal file
25
systems/Summit/dwf16.lsf
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#BSUB -P LGT104
|
||||||
|
#BSUB -W 2:00
|
||||||
|
#BSUB -nnodes 4
|
||||||
|
#BSUB -J DWF
|
||||||
|
|
||||||
|
export OMP_NUM_THREADS=6
|
||||||
|
export PAMI_IBV_ADAPTER_AFFINITY=1
|
||||||
|
export PAMI_ENABLE_STRIPING=1
|
||||||
|
export OPT="--comms-concurrent --comms-overlap "
|
||||||
|
|
||||||
|
APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 "
|
||||||
|
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
|
||||||
|
|
||||||
|
APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
|
||||||
|
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
|
||||||
|
|
||||||
|
APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
|
||||||
|
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
25
systems/Summit/dwf4.lsf
Normal file
25
systems/Summit/dwf4.lsf
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#BSUB -P LGT104
|
||||||
|
#BSUB -W 2:00
|
||||||
|
#BSUB -nnodes 4
|
||||||
|
#BSUB -J DWF
|
||||||
|
|
||||||
|
export OMP_NUM_THREADS=6
|
||||||
|
export PAMI_IBV_ADAPTER_AFFINITY=1
|
||||||
|
export PAMI_ENABLE_STRIPING=1
|
||||||
|
export OPT="--comms-concurrent --comms-overlap "
|
||||||
|
#export GRID_ALLOC_NCACHE_LARGE=1
|
||||||
|
export APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 "
|
||||||
|
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
|
||||||
|
|
||||||
|
APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
|
||||||
|
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
|
||||||
|
|
||||||
|
APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
|
||||||
|
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
8
systems/Summit/sourceme-cuda10.sh
Normal file
8
systems/Summit/sourceme-cuda10.sh
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
export UCX_GDR_COPY_RCACHE=no
|
||||||
|
export UCX_MEMTYPE_CACHE=n
|
||||||
|
export UCX_RNDV_SCHEME=put_zcopy
|
||||||
|
module load gcc/7.5.0
|
||||||
|
module load cuda/10.2.89
|
||||||
|
#cuda/11.4.0
|
||||||
|
export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/:$LD_LIBRARY_PATH
|
||||||
|
|
Loading…
Reference in New Issue
Block a user