mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 23:37:06 +01:00

Automatic data motion options beginning

Peter Boyle
2020-05-17 16:34:25 -04:00
parent a9847aa866
commit ebb60330c9
14 changed files with 963 additions and 148 deletions


@@ -26,102 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#pragma once
NAMESPACE_BEGIN(Grid);
/*Move control to configure.ac and Config.h*/
#define POINTER_CACHE
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
#ifdef POINTER_CACHE
class PointerCache {
private:
static const int Ncache=128;
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
#endif
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
@@ -144,42 +52,23 @@ public:
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) acceleratorAllocShared(bytes);
_Tp *ptr = (_Tp*) AllocationCache::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
#if 0
size_type page_size=4096;
size_type pages = (bytes+page_size-1)/page_size;
uint8_t *bp = (uint8_t *)ptr;
accelerator_for(pg,pages,1,{
bp[pg*page_size]=0;
});
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
if ( __freeme ) acceleratorFreeShared((void *)__freeme);
AllocationCache::CpuFree((void *)__p,bytes);
}
// FIXME: hack for the copy constructor, eventually it must be avoided
@@ -201,4 +90,4 @@ template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<
NAMESPACE_END(Grid);
#endif
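// Illustrative sketch (hypothetical usage, not part of this diff): containers
// pick up the cached allocator purely through the allocator template argument,
// so, assuming the Vector alias this header defines alongside Matrix,
//
//   Vector<double> v(1<<20);        // allocate() -> AllocationCache::CpuAllocate
//   v.resize(0); v.shrink_to_fit(); // deallocate() -> AllocationCache::CpuFree
//
// no call sites need to change to gain the pooling behaviour.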


@@ -0,0 +1,159 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pointer caches
//////////////////////////////////////////////////////////////////////
AllocationCache::AllocationCacheEntry AllocationCache::Entries[AllocationCache::NallocType][AllocationCache::NallocCacheMax];
int AllocationCache::Victim[AllocationCache::NallocType];
int AllocationCache::Ncache[AllocationCache::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *AllocationCache::AcceleratorAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL )
ptr = (void *) acceleratorAllocDevice(bytes);
return ptr;
}
void AllocationCache::AcceleratorFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) acceleratorFreeDevice(__freeme); // pair with acceleratorAllocDevice in AcceleratorAllocate
}
void *AllocationCache::CpuAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
// std::cout <<"CpuAllocate: allocated pointer "<<std::hex<<ptr<<std::endl;
} else {
// std::cout <<"CpuAllocate: cached pointer "<<std::hex<<ptr<<std::endl;
}
return ptr;
}
void AllocationCache::CpuFree (void *ptr,size_t bytes)
{
// Look up in ViewCache
int e=CpuViewLookup(ptr);
if(e>=0){ Evict(e); }
// If present remove entry and free accelerator too.
// Can we ever hit a free event with a view still in scope?
void *__freeme = Insert(ptr,bytes,Cpu);
// std::cout <<"CpuFree cached pointer "<<std::hex<<ptr<<std::endl;
// std::cout <<"CpuFree deallocating pointer "<<std::hex<<__freeme<<std::endl;
if ( __freeme ) acceleratorFreeShared(__freeme);
}
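// Illustrative sketch (hypothetical caller, not from this commit): CpuFree
// parks the pointer in the Cpu pool rather than releasing it, so the next
// request of the same size is typically recycled:
//
//   void *a = AllocationCache::CpuAllocate(1<<20);
//   AllocationCache::CpuFree(a,1<<20);             // cached, not returned to the OS
//   void *b = AllocationCache::CpuAllocate(1<<20); // typically b == a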
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void AllocationCache::Init(void)
{
Ncache[Cpu] = 8;
Ncache[Acc] = 8;
Ncache[CpuSmall] = 32;
Ncache[AccSmall] = 32;
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
}
}
}
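// Illustrative sketch (hypothetical driver, not from this commit): the pool
// depths can be overridden before Init() runs; setenv here is plain POSIX:
//
//   setenv("GRID_ALLOC_NCACHE_LARGE","16",1); // depth of the Cpu and Acc pools
//   setenv("GRID_ALLOC_NCACHE_SMALL","64",1); // pools for blocks < GRID_ALLOC_SMALL_LIMIT
//   AllocationCache::Init();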
void *AllocationCache::Insert(void *ptr,size_t bytes,int type)
{
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small; // relies on CpuSmall==Cpu+1 and AccSmall==Acc+1
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
}
void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *AllocationCache::Lookup(size_t bytes,int type)
{
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
}
void *AllocationCache::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { // recycle exact size matches only
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
NAMESPACE_END(Grid);


@@ -0,0 +1,93 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/AllocationCache.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024) /* 2 MiB: the x86 huge page size */
#define GRID_ALLOC_SMALL_LIMIT (4096)  /* below this, the "small" pools are used */
/*Pinning pages is costly*/
class AllocationCache {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=4;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,int type) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
/////////////////////////////////////////////////
// Internal device view
/////////////////////////////////////////////////
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static int ViewVictim(void);
static void Evict(int e);
static void Flush(int e);
static void Clone(int e);
static int CpuViewLookup(void *CpuPtr);
static int AccViewLookup(void *AccPtr);
public:
static void Init(void);
static void AccViewClose(void* AccPtr);
static void CpuViewClose(void* CpuPtr);
static void *AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
static void *CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
};
NAMESPACE_END(Grid);


@@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/AllocationCache.h>
#include <Grid/allocator/AlignedAllocator.h>


@@ -0,0 +1,338 @@
#include <Grid/GridCore.h>
#ifndef GRID_UNIFIED
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
const int NaccCacheMax=128;
typedef struct {
void *CpuPtr;
void *AccPtr;
size_t bytes;
uint32_t transient;
uint32_t state;
uint32_t accLock;
uint32_t cpuLock;
} AcceleratorViewEntry;
#define Write (1)
#define Read (2)
#define WriteDiscard (3)
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
static AcceleratorViewEntry AccCache[NaccCacheMax];
static int AccCacheVictim; // Base for round robin search
static int NaccCache = 8;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
int AllocationCache::ViewVictim(void)
{
int prioEmpty =-1;
int prioCpuDirty =-1;
int prioConsistent =-1;
int prioAccDirty =-1;
int prioCpuDirtyEN =-1;
int prioConsistentEN =-1;
int prioAccDirtyEN =-1;
int victim=-1;
// round robin priority search of unlocked entries offset from current victim
for(int ep=0;ep<NaccCache;ep++){
int e = (ep+AccCacheVictim)%NaccCache;
dprintf("AllocationCacheDeviceMem: Inspecting cache entry %d :",e);
uint32_t locks = AccCache[e].cpuLock+AccCache[e].accLock;
uint32_t s = AccCache[e].state;
uint32_t t = AccCache[e].transient;
assert( (s==Empty)||(s==CpuDirty)||(s==AccDirty)||(s==Consistent));
if ( locks==0 ) {
if( s==Empty ) { prioEmpty = e; dprintf("Empty");}
if( t == EvictNext ) {
if( s==CpuDirty ) { prioCpuDirtyEN = e; dprintf("CpuDirty Transient");}
if( s==Consistent ) { prioConsistentEN = e; dprintf("Consistent Transient");}
if( s==AccDirty ) { prioAccDirtyEN = e; dprintf("AccDirty Transient");}
} else {
if( s==CpuDirty ) { prioCpuDirty = e; dprintf("CpuDirty");}
if( s==Consistent ) { prioConsistent = e; dprintf("Consistent");}
if( s==AccDirty ) { prioAccDirty = e; dprintf("AccDirty");}
}
} else {
if ( AccCache[e].cpuLock ) dprintf("Locked in Cpu ");
if ( AccCache[e].accLock ) dprintf("Locked in Acc ");
}
dprintf("\n");
}
// This encodes the prioritisation for device residency
// EvictNext provides a transient mechanism
if ( prioAccDirty >= 0 ) victim = prioAccDirty;
if ( prioConsistent >= 0 ) victim = prioConsistent;
if ( prioCpuDirty >= 0 ) victim = prioCpuDirty;
if ( prioAccDirtyEN >= 0 ) victim = prioAccDirtyEN;
if ( prioConsistentEN >= 0 ) victim = prioConsistentEN;
if ( prioCpuDirtyEN >= 0 ) victim = prioCpuDirtyEN;
if ( prioEmpty >= 0 ) victim = prioEmpty; /*Highest prio is winner*/
assert(victim >= 0); // Must succeed
dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim);
// advance victim pointer
AccCacheVictim=(AccCacheVictim+1)%NaccCache;
dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache);
return victim;
}
/////////////////////////////////////////////////
// Accelerator cache motion
/////////////////////////////////////////////////
void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry
{
if(AccCache[e].state!=Empty){
dprintf("AllocationCache: Evict(%d) %llx,%llxn",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
assert(AccCache[e].accLock==0);
assert(AccCache[e].cpuLock==0);
if(AccCache[e].state==AccDirty) {
Flush(e);
}
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr) {
dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes);
}
}
AccCache[e].AccPtr=NULL;
AccCache[e].CpuPtr=NULL;
AccCache[e].bytes=0;
AccCache[e].state=Empty;
AccCache[e].accLock=0;
AccCache[e].cpuLock=0;
}
void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. Do not remove
{
dprintf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
assert(AccCache[e].state==AccDirty);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock==0);
assert(AccCache[e].AccPtr!=NULL);
assert(AccCache[e].CpuPtr!=NULL);
acceleratorCopyFromDevice(AccCache[e].AccPtr,AccCache[e].CpuPtr,AccCache[e].bytes);
AccCache[e].state=Consistent;
}
void AllocationCache::Clone(int e)// Copy from CPU, mark consistent. Allocate if necessary
{
assert(AccCache[e].state==CpuDirty);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock==0);
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr==NULL){
AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes);
}
dprintf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes);
AccCache[e].state=Consistent;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
int e=CpuViewLookup(CpuPtr);
if(e==-1) {
e = ViewVictim();
Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
}
assert(AccCache[e].cpuLock==0); // Programming error
if(AccCache[e].state!=Empty) {
assert(AccCache[e].CpuPtr == CpuPtr);
assert(AccCache[e].bytes==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
if(AccCache[e].state==Empty) {
AccCache[e].CpuPtr = CpuPtr;
AccCache[e].AccPtr = NULL;
AccCache[e].bytes = bytes;
AccCache[e].state = CpuDirty; // Cpu starts primary
Clone(e);
if(mode==Write)
AccCache[e].state = AccDirty; // Empty + AccWrite=> AccDirty
else
AccCache[e].state = Consistent; // Empty + AccRead => Consistent
AccCache[e].accLock= 1;
} else if(AccCache[e].state&CpuDirty ){
Clone(e);
if(mode==Write)
AccCache[e].state = AccDirty; // CpuDirty + AccWrite=> AccDirty
else
AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent
AccCache[e].accLock++;
} else if(AccCache[e].state&Consistent) {
if(mode==Write)
AccCache[e].state = AccDirty; // Consistent + AccWrite=> AccDirty
else
AccCache[e].state = Consistent; // Consistent + AccRead => Consistent
AccCache[e].accLock++;
} else if(AccCache[e].state&AccDirty) {
if(mode==Write)
AccCache[e].state = AccDirty; // AccDirty + AccWrite=> AccDirty
else
AccCache[e].state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache[e].accLock++;
} else {
assert(0);
}
AccCache[e].transient= transient? EvictNext : 0;
return AccCache[e].AccPtr;
}
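// Illustrative sketch (hypothetical caller, not from this commit): a Write
// view clones host data to the device on first touch and takes an accLock;
// Close only drops the lock, leaving the entry AccDirty and device resident:
//
//   void *d = AllocationCache::AccViewOpen(h,bytes,Write,0 /*resident*/);
//   // ... launch device kernel reading/writing through d ...
//   AllocationCache::AccViewClose(d); // accLock--, data stays on the device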
/*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void AllocationCache::AccViewClose(void* AccPtr)
{
int e=AccViewLookup(AccPtr);
assert(e!=-1);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock>0);
AccCache[e].accLock--;
}
void AllocationCache::CpuViewClose(void* CpuPtr)
{
int e=CpuViewLookup(CpuPtr);
assert(e!=-1);
assert(AccCache[e].cpuLock>0);
assert(AccCache[e].accLock==0);
AccCache[e].cpuLock--;
}
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
int e=CpuViewLookup(CpuPtr);
if(e==-1) {
e = ViewVictim();
Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
}
assert(AccCache[e].accLock==0); // Programming error
if(AccCache[e].state!=Empty) {
assert(AccCache[e].CpuPtr == CpuPtr);
assert(AccCache[e].bytes==bytes);
}
if(AccCache[e].state==Empty) {
AccCache[e].CpuPtr = CpuPtr;
AccCache[e].AccPtr = NULL;
AccCache[e].bytes = bytes;
AccCache[e].state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache[e].accLock= 0;
AccCache[e].cpuLock= 1;
} else if(AccCache[e].state==CpuDirty ){
// AccPtr: don't care, allocation is deferred
AccCache[e].state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache[e].cpuLock++;
} else if(AccCache[e].state==Consistent) {
assert(AccCache[e].AccPtr != NULL);
if(mode==Write)
AccCache[e].state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache[e].state = Consistent; // Consistent +CpuRead => Consistent
AccCache[e].cpuLock++;
} else if(AccCache[e].state==AccDirty) {
assert(AccCache[e].AccPtr != NULL);
Flush(e);
if(mode==Write) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache[e].state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache[e].cpuLock++;
} else {
assert(0); // should be unreachable
}
AccCache[e].transient= transient? EvictNext : 0;
return AccCache[e].CpuPtr;
}
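// Companion sketch (hypothetical caller, not from this commit): a Read view
// of an AccDirty entry triggers Flush(), so host code sees the device results
// without an explicit copy; the returned pointer is the CpuPtr itself:
//
//   void *h2 = AllocationCache::CpuViewOpen(h,bytes,Read,0);
//   // ... host reads through h2 (== h) ...
//   AllocationCache::CpuViewClose(h2);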
//////////////////////////////////////////////////////////////////////////////
//loop round robin over entries checking acc pointer
//////////////////////////////////////////////////////////////////////////////
int AllocationCache::CpuViewLookup(void *CpuPtr)
{
assert(CpuPtr!=NULL);
for(int e=0;e<NaccCache;e++){
if ( (AccCache[e].state!=Empty) && (AccCache[e].CpuPtr==CpuPtr) ) {
return e;
}
}
return -1;
}
int AllocationCache::AccViewLookup(void *AccPtr)
{
assert(AccPtr!=NULL);
for(int e=0;e<NaccCache;e++){
if ( (AccCache[e].state!=Empty) && (AccCache[e].AccPtr==AccPtr) ) {
return e;
}
}
return -1;
}
NAMESPACE_END(Grid);
#endif


@@ -0,0 +1,27 @@
#include <Grid/GridCore.h>
#ifdef GRID_UNIFIED
#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void AllocationCache::AccViewClose(void* AccPtr){}
void AllocationCache::CpuViewClose(void* CpuPtr){}
/////////////////////////////////////
// Dummy stubs
/////////////////////////////////////
int AllocationCache::ViewVictim(void) { assert(0); return 0;}
void AllocationCache::Evict(int e) { assert(0);}
void AllocationCache::Flush(int e) { assert(0);}
void AllocationCache::Clone(int e) { assert(0);}
int AllocationCache::CpuViewLookup(void *CpuPtr){assert(0); return 0;}
int AllocationCache::AccViewLookup(void *AccPtr){assert(0); return 0;}
NAMESPACE_END(Grid);
#endif


@@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512; // one 2 MiB huge page spans 512 4 KiB pages
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; // pagemap bits 0-54 hold the PFN
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
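// Illustrative sketch (hypothetical, not from this commit): with
// GRID_ALLOC_ALIGN at 2 MiB, a large allocation can be checked for
// huge-page backing directly:
//
//   void *buf = AllocationCache::CpuAllocate(1<<26);
//   check_huge_pages(buf,1<<26); // reports "... 0 not in huge pages" when fully huge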
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[bufSize];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 6) // keep s a valid index into suffixes[0..6]
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
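// Examples: sizeString(1536) -> "1.5 KB"; sizeString(3*1024*1024) -> "3 MB".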
NAMESPACE_END(Grid);


@@ -0,0 +1,95 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
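// Illustrative sketch (hypothetical driver, not from this commit): wire the
// profiler up once and every profilerAllocate/profilerFree call is tallied
// and, with debug on, logged through GridLogDebug:
//
//   static MemoryStats memStats;
//   MemoryProfiler::stats = &memStats;
//   MemoryProfiler::debug = true;
//   profilerAllocate(1024); // logs "allocating 1024 (1 KB)" plus the running totals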
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);