From ebb60330c90e085d30a799fb4176cf3faa2635cb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 17 May 2020 16:34:25 -0400 Subject: [PATCH] Automatic data motion options beginning --- Grid/GridCore.h | 2 +- Grid/allocator/AlignedAllocator.h | 127 +--------- Grid/allocator/AllocationCache.cc | 159 ++++++++++++ Grid/allocator/AllocationCache.h | 93 +++++++ Grid/allocator/Allocator.h | 4 + Grid/allocator/MemoryCacheDeviceMem.cc | 338 +++++++++++++++++++++++++ Grid/allocator/MemoryCacheShared.cc | 27 ++ Grid/allocator/MemoryStats.cc | 67 +++++ Grid/allocator/MemoryStats.h | 95 +++++++ Grid/lattice/Lattice_ET.h | 70 ++++- Grid/lattice/Lattice_base.h | 79 +++++- Grid/threads/Accelerator.h | 31 ++- Grid/util/Init.cc | 2 + configure.ac | 17 +- 14 files changed, 963 insertions(+), 148 deletions(-) create mode 100644 Grid/allocator/AllocationCache.cc create mode 100644 Grid/allocator/AllocationCache.h create mode 100644 Grid/allocator/Allocator.h create mode 100644 Grid/allocator/MemoryCacheDeviceMem.cc create mode 100644 Grid/allocator/MemoryCacheShared.cc create mode 100644 Grid/allocator/MemoryStats.cc create mode 100644 Grid/allocator/MemoryStats.h diff --git a/Grid/GridCore.h b/Grid/GridCore.h index f7c1267a..2209f960 100644 --- a/Grid/GridCore.h +++ b/Grid/GridCore.h @@ -47,7 +47,7 @@ Author: paboyle #include #include #include -#include +#include #include #include #include diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index c8742d3e..c3a32cd3 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -26,102 +26,10 @@ Author: Peter Boyle See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#ifndef GRID_ALIGNED_ALLOCATOR_H -#define GRID_ALIGNED_ALLOCATOR_H +#pragma once NAMESPACE_BEGIN(Grid); -/*Move control to configure.ac and Config.h*/ -#define POINTER_CACHE -/*Pinning pages is costly*/ -/*Could maintain separate large and small allocation caches*/ -#ifdef POINTER_CACHE -class PointerCache { -private: - - static const int Ncache=128; - static int victim; - - typedef struct { - void *address; - size_t bytes; - int valid; - } PointerCacheEntry; - - static PointerCacheEntry Entries[Ncache]; - -public: - - static void *Insert(void *ptr,size_t bytes) ; - static void *Lookup(size_t bytes) ; - -}; -#endif - -std::string sizeString(size_t bytes); - -struct MemoryStats -{ - size_t totalAllocated{0}, maxAllocated{0}, - currentlyAllocated{0}, totalFreed{0}; -}; - -class MemoryProfiler -{ -public: - static MemoryStats *stats; - static bool debug; -}; - -#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" -#define profilerDebugPrint \ - if (MemoryProfiler::stats) \ - { \ - auto s = MemoryProfiler::stats; \ - std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ - << std::endl; \ - } - -#define profilerAllocate(bytes) \ - if (MemoryProfiler::stats) \ - { \ - auto s = MemoryProfiler::stats; \ - s->totalAllocated += (bytes); \ - 
s->currentlyAllocated += (bytes); \ - s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \ - } \ - if (MemoryProfiler::debug) \ - { \ - std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \ - profilerDebugPrint; \ - } - -#define profilerFree(bytes) \ - if (MemoryProfiler::stats) \ - { \ - auto s = MemoryProfiler::stats; \ - s->totalFreed += (bytes); \ - s->currentlyAllocated -= (bytes); \ - } \ - if (MemoryProfiler::debug) \ - { \ - std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \ - profilerDebugPrint; \ - } - -void check_huge_pages(void *Buf,uint64_t BYTES); - -//////////////////////////////////////////////////////////////////// -// A lattice of something, but assume the something is SIMDized. -//////////////////////////////////////////////////////////////////// - template class alignedAllocator { public: @@ -144,42 +52,23 @@ public: pointer allocate(size_type __n, const void* _p= 0) { size_type bytes = __n*sizeof(_Tp); + profilerAllocate(bytes); -#ifdef POINTER_CACHE - _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); -#else - pointer ptr = nullptr; -#endif - - if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) acceleratorAllocShared(bytes); - + _Tp *ptr = (_Tp*) AllocationCache::CpuAllocate(bytes); + assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); -#if 0 - size_type page_size=4096; - size_type pages = (bytes+page_size-1)/page_size; - uint8_t *bp = (uint8_t *)ptr; - - accelerator_for(pg,pages,1,{ - bp[pg*page_size]=0; - }); -#endif return ptr; } - void deallocate(pointer __p, size_type __n) { + void deallocate(pointer __p, size_type __n) + { size_type bytes = __n * sizeof(_Tp); profilerFree(bytes); -#ifdef POINTER_CACHE - pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); -#else - pointer __freeme = __p; -#endif - - if ( __freeme ) acceleratorFreeShared((void *)__freeme); + AllocationCache::CpuFree((void *)__p,bytes); } // FIXME: hack for the copy constructor, eventually it must be avoided @@ -201,4 +90,4 @@ template using Matrix = std::vector + +NAMESPACE_BEGIN(Grid); + +/*Allocation types, saying which pointer cache should be used*/ +#define Cpu (0) +#define CpuSmall (1) +#define Acc (2) +#define AccSmall (3) + +////////////////////////////////////////////////////////////////////// +// Data tables for recently freed pooiniter caches +////////////////////////////////////////////////////////////////////// +AllocationCache::AllocationCacheEntry AllocationCache::Entries[AllocationCache::NallocType][AllocationCache::NallocCacheMax]; +int AllocationCache::Victim[AllocationCache::NallocType]; +int AllocationCache::Ncache[AllocationCache::NallocType]; + +////////////////////////////////////////////////////////////////////// +// Actual allocation and deallocation utils +////////////////////////////////////////////////////////////////////// +void *AllocationCache::AcceleratorAllocate(size_t bytes) +{ + void *ptr = (void *) Lookup(bytes,Acc); + + if ( ptr == (void *) NULL ) + ptr = (void *) acceleratorAllocDevice(bytes); + + return ptr; +} +void AllocationCache::AcceleratorFree (void *ptr,size_t bytes) +{ + void *__freeme = Insert(ptr,bytes,Acc); + + if ( __freeme ) acceleratorFreeShared(__freeme); +} +void *AllocationCache::CpuAllocate(size_t bytes) +{ + void *ptr = (void *) Lookup(bytes,Cpu); + + if ( ptr == (void *) NULL ) { + ptr = (void *) acceleratorAllocShared(bytes); + // std::cout <<"CpuAllocate: allocated pointer "<=0){ Evict(e); } + + // If present remove entry and free accelerator too. 
+ // Can we ever hit a free event with a view still in scope? + void *__freeme = Insert(ptr,bytes,Cpu); + // std::cout <<"CpuFree cached pointer "<=0) && (Nc < NallocCacheMax)) { + Ncache[Cpu]=Nc; + Ncache[Acc]=Nc; + } + } + + str= getenv("GRID_ALLOC_NCACHE_SMALL"); + if ( str ) { + Nc = atoi(str); + if ( (Nc>=0) && (Nc < NallocCacheMax)) { + Ncache[CpuSmall]=Nc; + Ncache[AccSmall]=Nc; + } + } +} + +void *AllocationCache::Insert(void *ptr,size_t bytes,int type) +{ + bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); + int cache = type + small; + return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]); +} +void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) +{ + assert(ncache>0); +#ifdef GRID_OMP + assert(omp_in_parallel()==0); +#endif + + void * ret = NULL; + int v = -1; + + for(int e=0;e0); +#ifdef GRID_OMP + assert(omp_in_parallel()==0); +#endif + for(int e=0;e +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +// Move control to configure.ac and Config.h? 
+ +#define ALLOCATION_CACHE +#define GRID_ALLOC_ALIGN (2*1024*1024) +#define GRID_ALLOC_SMALL_LIMIT (4096) + +/*Pinning pages is costly*/ + +class AllocationCache { +private: + + //////////////////////////////////////////////////////////// + // For caching recently freed allocations + //////////////////////////////////////////////////////////// + typedef struct { + void *address; + size_t bytes; + int valid; + } AllocationCacheEntry; + + static const int NallocCacheMax=128; + static const int NallocType=4; + static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; + static int Victim[NallocType]; + static int Ncache[NallocType]; + + ///////////////////////////////////////////////// + // Free pool + ///////////////////////////////////////////////// + static void *Insert(void *ptr,size_t bytes,int type) ; + static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ; + static void *Lookup(size_t bytes,int type) ; + static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ; + + ///////////////////////////////////////////////// + // Internal device view + ///////////////////////////////////////////////// + static void *AcceleratorAllocate(size_t bytes); + static void AcceleratorFree (void *ptr,size_t bytes); + static int ViewVictim(void); + static void Evict(int e); + static void Flush(int e); + static void Clone(int e); + static int CpuViewLookup(void *CpuPtr); + static int AccViewLookup(void *AccPtr); + +public: + static void Init(void); + + static void AccViewClose(void* AccPtr); + static void CpuViewClose(void* CpuPtr); + static void *AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient); + static void *CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient); + + static void *CpuAllocate(size_t bytes); + static void CpuFree (void *ptr,size_t bytes); +}; + +NAMESPACE_END(Grid); + + diff --git a/Grid/allocator/Allocator.h b/Grid/allocator/Allocator.h new file mode 100644 index 00000000..9eaec8f6 --- /dev/null +++ b/Grid/allocator/Allocator.h @@ -0,0 +1,4 @@ +#pragma once +#include +#include +#include diff --git a/Grid/allocator/MemoryCacheDeviceMem.cc b/Grid/allocator/MemoryCacheDeviceMem.cc new file mode 100644 index 00000000..e46d48af --- /dev/null +++ b/Grid/allocator/MemoryCacheDeviceMem.cc @@ -0,0 +1,338 @@ +#include +#ifndef GRID_UNIFIED + +#warning "Using explicit device memory copies" +NAMESPACE_BEGIN(Grid); +#define dprintf(...) 
+ +//////////////////////////////////////////////////////////// +// For caching copies of data on device +//////////////////////////////////////////////////////////// +const int NaccCacheMax=128; + +typedef struct { + void *CpuPtr; + void *AccPtr; + size_t bytes; + uint32_t transient; + uint32_t state; + uint32_t accLock; + uint32_t cpuLock; +} AcceleratorViewEntry; + +#define Write (1) +#define Read (2) +#define WriteDiscard (3) +////////////////////////////////////////////////////////////////////// +// Data tables for ViewCache +////////////////////////////////////////////////////////////////////// +static AcceleratorViewEntry AccCache[NaccCacheMax]; +static int AccCacheVictim; // Base for round robin search +static int NaccCache = 8; + +//////////////////////////////////// +// Priority ordering for unlocked entries +// Empty +// CpuDirty +// Consistent +// AccDirty +//////////////////////////////////// +#define Empty (0x0) /*Entry unoccupied */ +#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/ +#define Consistent (0x2) /*ACC copy AND CPU copy are valid */ +#define AccDirty (0x4) /*ACC copy is golden */ +#define EvictNext (0x8) /*Priority for eviction*/ + +int AllocationCache::ViewVictim(void) +{ + int prioEmpty =-1; + int prioCpuDirty =-1; + int prioConsistent =-1; + int prioAccDirty =-1; + int prioCpuDirtyEN =-1; + int prioConsistentEN =-1; + int prioAccDirtyEN =-1; + + int victim=-1; + + // round robin priority search of unlocked entries offset from current victim + for(int ep=0;ep= 0 ) victim = prioAccDirty; + if ( prioConsistent >= 0 ) victim = prioConsistent; + if ( prioCpuDirty >= 0 ) victim = prioCpuDirty; + if ( prioAccDirtyEN >= 0 ) victim = prioAccDirtyEN; + if ( prioConsistentEN >= 0 ) victim = prioConsistentEN; + if ( prioCpuDirtyEN >= 0 ) victim = prioCpuDirtyEN; + if ( prioEmpty >= 0 ) victim = prioEmpty; /*Highest prio is winner*/ + + assert(victim >= 0); // Must succeed/ + dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim); + + // advance victim pointer + AccCacheVictim=(AccCacheVictim+1)%NaccCache; + dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache); + + return victim; +} +///////////////////////////////////////////////// +// Accelerator cache motion +///////////////////////////////////////////////// +void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry +{ + if(AccCache[e].state!=Empty){ + dprintf("AllocationCache: Evict(%d) %llx,%llxn",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); + assert(AccCache[e].accLock==0); + assert(AccCache[e].cpuLock==0); + if(AccCache[e].state==AccDirty) { + Flush(e); + } + assert(AccCache[e].CpuPtr!=NULL); + if(AccCache[e].AccPtr) { + dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr); + AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes); + } + } + AccCache[e].AccPtr=NULL; + AccCache[e].CpuPtr=NULL; + AccCache[e].bytes=0; + AccCache[e].state=Empty; + AccCache[e].accLock=0; + AccCache[e].cpuLock=0; +} +void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. 
Do not remove +{ + dprintf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); + assert(AccCache[e].state==AccDirty); + assert(AccCache[e].cpuLock==0); + assert(AccCache[e].accLock==0); + assert(AccCache[e].AccPtr!=NULL); + assert(AccCache[e].CpuPtr!=NULL); + acceleratorCopyFromDevice(AccCache[e].AccPtr,AccCache[e].CpuPtr,AccCache[e].bytes); + AccCache[e].state=Consistent; +} +void AllocationCache::Clone(int e)// Copy from CPU, mark consistent. Allocate if necessary +{ + assert(AccCache[e].state==CpuDirty); + assert(AccCache[e].cpuLock==0); + assert(AccCache[e].accLock==0); + assert(AccCache[e].CpuPtr!=NULL); + if(AccCache[e].AccPtr==NULL){ + AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes); + } + dprintf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); + acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes); + AccCache[e].state=Consistent; +} +///////////////////////////////////////////////////////////////////////////////// +// View management +///////////////////////////////////////////////////////////////////////////////// +void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) +{ + //////////////////////////////////////////////////////////////////////////// + // Find if present, otherwise get or force an empty + //////////////////////////////////////////////////////////////////////////// + int e=CpuViewLookup(CpuPtr); + if(e==-1) { + e = ViewVictim(); + Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty + } + + assert(AccCache[e].cpuLock==0); // Programming error + + if(AccCache[e].state!=Empty) { + assert(AccCache[e].CpuPtr == CpuPtr); + assert(AccCache[e].bytes==bytes); + } +/* + * State transitions and actions + * + * Action State StateNext Flush Clone + * + * AccRead Empty Consistent - Y + * AccWrite Empty AccDirty - Y + * AccRead CpuDirty Consistent - Y + * AccWrite CpuDirty AccDirty - Y + * AccRead Consistent Consistent - - + * AccWrite Consistent AccDirty - - + * AccRead AccDirty AccDirty - - + * AccWrite AccDirty AccDirty - - + */ + if(AccCache[e].state==Empty) { + AccCache[e].CpuPtr = CpuPtr; + AccCache[e].AccPtr = NULL; + AccCache[e].bytes = bytes; + AccCache[e].state = CpuDirty; // Cpu starts primary + Clone(e); + if(mode==Write) + AccCache[e].state = AccDirty; // Empty + AccWrite=> AccDirty + else + AccCache[e].state = Consistent; // Empty + AccRead => Consistent + AccCache[e].accLock= 1; + } else if(AccCache[e].state&CpuDirty ){ + Clone(e); + if(mode==Write) + AccCache[e].state = AccDirty; // CpuDirty + AccWrite=> AccDirty + else + AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent + AccCache[e].accLock++; + } else if(AccCache[e].state&Consistent) { + if(mode==Write) + AccCache[e].state = AccDirty; // Consistent + AccWrite=> AccDirty + else + AccCache[e].state = Consistent; // Consistent + AccRead => Consistent + AccCache[e].accLock++; + } else if(AccCache[e].state&AccDirty) { + if(mode==Write) + AccCache[e].state = AccDirty; // AccDirty + AccWrite=> AccDirty + else + AccCache[e].state = AccDirty; // AccDirty + AccRead => AccDirty + AccCache[e].accLock++; + } else { + assert(0); + } + + AccCache[e].transient= transient? 
EvictNext : 0; + + return AccCache[e].AccPtr; +} +/* + * Action State StateNext Flush Clone + * + * CpuRead Empty CpuDirty - - + * CpuWrite Empty CpuDirty - - + * CpuRead CpuDirty CpuDirty - - + * CpuWrite CpuDirty CpuDirty - - + * CpuRead Consistent Consistent - - + * CpuWrite Consistent CpuDirty - - + * CpuRead AccDirty Consistent Y - + * CpuWrite AccDirty CpuDirty Y - + */ +//////////////////////////////////// +// look up & decrement lock count +//////////////////////////////////// +void AllocationCache::AccViewClose(void* AccPtr) +{ + int e=AccViewLookup(AccPtr); + assert(e!=-1); + assert(AccCache[e].cpuLock==0); + assert(AccCache[e].accLock>0); + AccCache[e].accLock--; +} +void AllocationCache::CpuViewClose(void* CpuPtr) +{ + int e=CpuViewLookup(CpuPtr); + assert(e!=-1); + assert(AccCache[e].cpuLock>0); + assert(AccCache[e].accLock==0); + AccCache[e].cpuLock--; +} +void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) +{ + //////////////////////////////////////////////////////////////////////////// + // Find if present, otherwise get or force an empty + //////////////////////////////////////////////////////////////////////////// + int e=CpuViewLookup(CpuPtr); + if(e==-1) { + e = ViewVictim(); + Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty + } + + assert(AccCache[e].accLock==0); // Programming error + + if(AccCache[e].state!=Empty) { + assert(AccCache[e].CpuPtr == CpuPtr); + assert(AccCache[e].bytes==bytes); + } + + if(AccCache[e].state==Empty) { + AccCache[e].CpuPtr = CpuPtr; + AccCache[e].AccPtr = NULL; + AccCache[e].bytes = bytes; + AccCache[e].state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty + AccCache[e].accLock= 0; + AccCache[e].cpuLock= 1; + } else if(AccCache[e].state==CpuDirty ){ + // AccPtr dont care, deferred allocate + AccCache[e].state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty + AccCache[e].cpuLock++; + } else if(AccCache[e].state==Consistent) { + assert(AccCache[e].AccPtr != NULL); + if(mode==Write) + AccCache[e].state = CpuDirty; // Consistent +CpuWrite => CpuDirty + else + AccCache[e].state = Consistent; // Consistent +CpuRead => Consistent + AccCache[e].cpuLock++; + } else if(AccCache[e].state==AccDirty) { + assert(AccCache[e].AccPtr != NULL); + Flush(e); + if(mode==Write) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush + else AccCache[e].state = Consistent; // AccDirty +CpuRead => Consistent, Flush + AccCache[e].cpuLock++; + } else { + assert(0); // should be unreachable + } + + AccCache[e].transient= transient? 
EvictNext : 0; + + return AccCache[e].CpuPtr; +} + +////////////////////////////////////////////////////////////////////////////// +//loop round robin over entries checking acc pointer +////////////////////////////////////////////////////////////////////////////// +int AllocationCache::CpuViewLookup(void *CpuPtr) +{ + assert(CpuPtr!=NULL); + for(int e=0;e +#ifdef GRID_UNIFIED + +#warning "Grid is assuming unified virtual memory address space" +NAMESPACE_BEGIN(Grid); +///////////////////////////////////////////////////////////////////////////////// +// View management is 1:1 address space mapping +///////////////////////////////////////////////////////////////////////////////// + +void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; } +void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; } +void AllocationCache::AccViewClose(void* AccPtr){} +void AllocationCache::CpuViewClose(void* CpuPtr){} + +///////////////////////////////////// +// Dummy stubs +///////////////////////////////////// +int AllocationCache::ViewVictim(void) { assert(0); return 0;} +void AllocationCache::Evict(int e) { assert(0);} +void AllocationCache::Flush(int e) { assert(0);} +void AllocationCache::Clone(int e) { assert(0);} + +int AllocationCache::CpuViewLookup(void *CpuPtr){assert(0); return 0;} +int AllocationCache::AccViewLookup(void *AccPtr){assert(0); return 0;} + +NAMESPACE_END(Grid); +#endif diff --git a/Grid/allocator/MemoryStats.cc b/Grid/allocator/MemoryStats.cc new file mode 100644 index 00000000..0d1707d9 --- /dev/null +++ b/Grid/allocator/MemoryStats.cc @@ -0,0 +1,67 @@ +#include +#include + +NAMESPACE_BEGIN(Grid); + +MemoryStats *MemoryProfiler::stats = nullptr; +bool MemoryProfiler::debug = false; + +void check_huge_pages(void *Buf,uint64_t BYTES) +{ +#ifdef __linux__ + int fd = open("/proc/self/pagemap", O_RDONLY); + assert(fd >= 0); + const int page_size = 4096; + uint64_t virt_pfn = (uint64_t)Buf / page_size; + off_t offset = sizeof(uint64_t) * virt_pfn; + uint64_t npages = (BYTES + page_size-1) / page_size; + uint64_t pagedata[npages]; + uint64_t ret = lseek(fd, offset, SEEK_SET); + assert(ret == offset); + ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + assert(ret == sizeof(uint64_t) * npages); + int nhugepages = npages / 512; + int n4ktotal, nnothuge; + n4ktotal = 0; + nnothuge = 0; + for (int i = 0; i < nhugepages; ++i) { + uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; + for (int j = 0; j < 512; ++j) { + uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; + ++n4ktotal; + if (pageaddr != baseaddr + j * page_size) + ++nnothuge; + } + } + int rank = CartesianCommunicator::RankWorld(); + printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); +#endif +} + +std::string sizeString(const size_t bytes) +{ + constexpr unsigned int bufSize = 256; + const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; + char buf[256]; + size_t s = 0; + double count = bytes; + + while (count >= 1024 && s < 7) + { + s++; + count /= 1024; + } + if (count - floor(count) == 0.0) + { + snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); + } + else + { + snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); + } + + return std::string(buf); +} + +NAMESPACE_END(Grid); + diff --git a/Grid/allocator/MemoryStats.h b/Grid/allocator/MemoryStats.h new file mode 100644 index 00000000..156c9747 --- /dev/null +++ 
b/Grid/allocator/MemoryStats.h @@ -0,0 +1,95 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/MemoryStats.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + + +NAMESPACE_BEGIN(Grid); + +std::string sizeString(size_t bytes); + +struct MemoryStats +{ + size_t totalAllocated{0}, maxAllocated{0}, + currentlyAllocated{0}, totalFreed{0}; +}; + +class MemoryProfiler +{ +public: + static MemoryStats *stats; + static bool debug; +}; + +#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" +#define profilerDebugPrint \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ + << std::endl; \ + } + +#define profilerAllocate(bytes) \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + s->totalAllocated += (bytes); \ + s->currentlyAllocated += (bytes); \ + s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \ + } \ + if (MemoryProfiler::debug) \ + { \ + std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \ + profilerDebugPrint; \ + } + +#define profilerFree(bytes) \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + s->totalFreed += (bytes); \ + s->currentlyAllocated -= (bytes); \ + } \ + if (MemoryProfiler::debug) \ + { \ + std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \ + profilerDebugPrint; \ + } + +void check_huge_pages(void *Buf,uint64_t BYTES); + +NAMESPACE_END(Grid); + diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index cf7147b9..b8abd199 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -87,7 +87,7 @@ sobj eval(const uint64_t ss, const sobj &arg) } template accelerator_inline -const lobj & eval(const uint64_t ss, const LatticeView &arg) +const lobj & eval(const uint64_t ss, const LatticeExprView &arg) { return arg[ss]; } @@ -179,16 +179,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf cb = 
lat.Checkerboard(); } template ::value, T1>::type * = nullptr> -inline void CBFromExpression(int &cb, const T1 ¬lat) // non-lattice leaf -{ -} - +inline void CBFromExpression(int &cb, const T1 ¬lat) {} // non-lattice leaf template inline void CBFromExpression(int &cb,const LatticeUnaryExpression &expr) { CBFromExpression(cb, expr.arg1); // recurse AST } - template inline void CBFromExpression(int &cb,const LatticeBinaryExpression &expr) { @@ -203,6 +199,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression::value, T1>::type * = nullptr> +inline void ExpressionViewOpen(T1 &lat) // Lattice leaf +{ + lat.AcceleratorViewOpen(); +} +template ::value, T1>::type * = nullptr> + inline void ExpressionViewOpen(T1 ¬lat) {} + +template inline +void ExpressionViewOpen(LatticeUnaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST +} + +template inline +void ExpressionViewOpen(LatticeBinaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST + ExpressionViewOpen(expr.arg2); // recurse AST +} +template +inline void ExpressionViewOpen(LatticeTrinaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST + ExpressionViewOpen(expr.arg2); // recurse AST + ExpressionViewOpen(expr.arg3); // recurse AST +} + +////////////////////////////////////////////////////////////////////////// +// ViewClose +////////////////////////////////////////////////////////////////////////// +template ::value, T1>::type * = nullptr> +inline void ExpressionViewClose( T1 &lat) // Lattice leaf +{ + lat.AcceleratorViewClose(); +} +template ::value, T1>::type * = nullptr> +inline void ExpressionViewClose(T1 ¬lat) {} + +template inline +void ExpressionViewClose(LatticeUnaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST +} +template inline +void ExpressionViewClose(LatticeBinaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST + ExpressionViewClose(expr.arg2); // recurse AST +} +template +inline void ExpressionViewClose(LatticeTrinaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST + ExpressionViewClose(expr.arg2); // recurse AST + ExpressionViewClose(expr.arg3); // recurse AST +} + //////////////////////////////////////////// // Unary operators and funcs //////////////////////////////////////////// diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 6a8664d4..76622275 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -83,11 +83,9 @@ public: // The copy constructor for this will need to be used by device lambda functions ///////////////////////////////////////////////////////////////////////////////////////// template -class LatticeView : public LatticeAccelerator +class LatticeExprView : public LatticeAccelerator { public: - - // Rvalue #ifdef GRID_SIMT accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); } @@ -102,11 +100,65 @@ public: accelerator_inline uint64_t end(void) const { return this->_odata_size; }; accelerator_inline uint64_t size(void) const { return this->_odata_size; }; - LatticeView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me) + // Non accelerator functions + LatticeExprView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me){} + ~LatticeExprView(){} + + void AcceleratorViewOpen(void) + { // Translate the pointer, could save a copy. 
Could use a "Handle" and not save _odata originally in base + void *cpu_ptr=this->_odata; + // std::cout << "AccViewOpen "<_odata <_odata=(vobj *)AllocationCache::AccViewOpen(this->_odata,this->_odata_size*sizeof(vobj),1,0); + } + void AcceleratorViewClose(void) + { // Inform the manager + // std::cout << "View Close"<_odata<_odata); + } + void CpuViewOpen(void) + { // Translate the pointer + void *cpu_ptr=this->_odata; + // std::cout << "CpuViewOpen "<_odata <_odata=(vobj *)AllocationCache::CpuViewOpen(cpu_ptr,this->_odata_size*sizeof(vobj),1,0); + } + void CpuViewClose(void) + { // Inform the manager + // std::cout << "CpuViewClose"<_odata<_odata); + } + +}; +// UserView constructor,destructor updates view manager +// Non-copyable object??? Second base with copy/= deleted? +template +class LatticeView : public LatticeExprView +{ +public: + // Rvalue + /* +#ifdef GRID_SIMT + accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); } +#else + accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } +#endif + + accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; + accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; + + accelerator_inline uint64_t begin(void) const { return 0;}; + accelerator_inline uint64_t end(void) const { return this->_odata_size; }; + accelerator_inline uint64_t size(void) const { return this->_odata_size; }; + */ + LatticeView(const LatticeAccelerator &refer_to_me) : LatticeExprView (refer_to_me) { + this->AcceleratorViewOpen(); + } + ~LatticeView(){ + this->AcceleratorViewClose(); } }; + ///////////////////////////////////////////////////////////////////////////////////////// // Lattice expression types used by ET to assemble the AST // @@ -120,7 +172,7 @@ template using is_lattice = std::is_base_of; template using is_lattice_expr = std::is_base_of; template struct ViewMapBase { typedef T Type; }; -template struct ViewMapBase { typedef LatticeView Type; }; +template struct ViewMapBase { typedef LatticeExprView Type; }; template using ViewMap = ViewMapBase::value >; template @@ -231,12 +283,15 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - + + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); auto me = View(); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + ExpressionViewClose(exprCopy); return *this; } template inline Lattice & operator=(const LatticeBinaryExpression &expr) @@ -251,11 +306,14 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); auto me = View(); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + ExpressionViewClose(exprCopy); return *this; } template inline Lattice & operator=(const LatticeTrinaryExpression &expr) @@ -269,11 +327,14 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); auto me = View(); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + ExpressionViewClose(exprCopy); return *this; } //GridFromExpression is tricky to do diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 0a5103a2..5da4e21e 100644 --- a/Grid/threads/Accelerator.h +++ 
b/Grid/threads/Accelerator.h
@@ -27,6 +27,17 @@ Author: paboyle
 *************************************************************************************/
 /* END LEGAL */
 #pragma once
+
+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#ifdef HAVE_MM_MALLOC_H
+#include <mm_malloc.h>
+#endif
+
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////////////////////////////////
@@ -144,8 +155,8 @@ inline void *acceleratorAllocDevice(size_t bytes)
 };
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
-
-
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 #endif
 
 //////////////////////////////////////////////
@@ -192,6 +203,8 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 #endif
 
@@ -275,6 +288,8 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 #endif
 
@@ -311,16 +326,8 @@ inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-
-#ifdef HAVE_MALLOC_MALLOC_H
-#include <malloc/malloc.h>
-#endif
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-#ifdef HAVE_MM_MALLOC_H
-#include <mm_malloc.h>
-#endif
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
 
 #ifdef HAVE_MM_MALLOC_H
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index f1ab6551..97ac7dc9 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -286,6 +286,8 @@ void Grid_init(int *argc,char ***argv)
   //////////////////////////////////////////////////////////
   acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver
 
+  AllocationCache::Init();
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
     int MB;
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
diff --git a/configure.ac b/configure.ac
index f9ea03fc..74d37605 100644
--- a/configure.ac
+++ b/configure.ac
@@ -147,7 +147,7 @@ case ${ac_SUMMIT} in
      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
 esac
 
-############### SYCL
+############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
     [AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])],
     [ac_ACCELERATOR=${enable_accelerator}], [ac_ACCELERATOR=none])
@@ -168,6 +168,20 @@ case ${ac_ACCELERATOR} in
      AC_MSG_ERROR(["Acceleration not suppoorted ${ac_ACCELERATOR}"]);;
 esac
 
+############### UNIFIED MEMORY
+AC_ARG_ENABLE([unified],
+    [AC_HELP_STRING([--enable-unified=yes|no], [enable unified address space for accelerator loops])],
+    [ac_UNIFIED=${enable_unified}], [ac_UNIFIED=yes])
+case ${ac_UNIFIED} in
+   yes)
+     echo Unified memory for accelerator loops
+     AC_DEFINE([GRID_UNIFIED],[1],[Use unified address space]);;
+   no)
+     echo Manual memory copy for accelerator loops;;
+   *)
+     AC_MSG_ERROR(["Unified virtual memory option not supported ${ac_UNIFIED}"]);;
+esac
+
 ############### Intel libraries
 AC_ARG_ENABLE([mkl],
     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@@ -612,6 +626,7 @@ compiler version : ${ax_cv_gxx_version}
 SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading : ${ac_openmp}
 Acceleration : ${ac_ACCELERATOR}
+Unified virtual memory : ${ac_UNIFIED}
 Communications type : ${comms_type}
 Shared memory allocator : ${ac_SHM}
 Shared memory mmap path : ${ac_SHMPATH}
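
Usage sketch (illustrative only, not part of the patch): the fragment below drives the Empty/CpuDirty/Consistent/AccDirty state machine by hand through the AccViewOpen/AccViewClose/CpuViewOpen/CpuViewClose calls declared in AllocationCache.h, i.e. what LatticeExprView::AcceleratorViewOpen/Close and the ExpressionViewOpen/Close helpers do automatically for lattice expressions. It assumes an --enable-unified=no build and that Grid_init() has already run (it calls AllocationCache::Init()); the include path, the function name and the elided kernel launch are placeholders, and the literal mode values follow the Write=1 / Read=2 defines in MemoryCacheDeviceMem.cc.

// Illustrative sketch -- not part of this patch.
#include <Grid/GridCore.h>

void update_on_device(double *x, size_t n)
{
  using namespace Grid;
  size_t bytes = n * sizeof(double);

  // Host data starts CpuDirty; opening an accelerator write view clones it into
  // the device cache, marks the entry AccDirty and takes an accLock so the
  // entry cannot be chosen as an eviction victim while the kernel runs.
  double *x_d = (double *) AllocationCache::AccViewOpen((void *)x, bytes, 1 /*Write*/, 0 /*not transient*/);

  // ... launch an accelerator_for kernel here that reads and writes x_d ...

  // Drop the accLock; the entry stays AccDirty until a CPU view (or an
  // eviction) triggers Flush(), which copies the device buffer back to x.
  AllocationCache::AccViewClose((void *)x_d);

  // A CPU read view flushes the dirty device copy before host access; the
  // returned pointer is the original host pointer x.
  double *x_h = (double *) AllocationCache::CpuViewOpen((void *)x, bytes, 2 /*Read*/, 0);
  // ... host-side use of x_h ...
  AllocationCache::CpuViewClose((void *)x_h);
}

Passing a non-zero transient flag instead marks the cache entry EvictNext, so device copies of one-off temporaries are preferred victims (after Empty slots) in ViewVictim's round-robin search.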