1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 23:37:06 +01:00

Make view specify where and drive data motion - first cut.

This is a compile time option --enable-unified=yes/no
This commit is contained in:
Peter Boyle
2020-05-21 16:13:16 -04:00
parent ebb60330c9
commit 7860a50f70
48 changed files with 688 additions and 718 deletions

View File

@ -12,7 +12,7 @@ bool MemoryProfiler::debug = false;
#define SMALL_LIMIT (4096)
#endif
#ifdef POINTER_CACHE
#ifdef ALLOCATION_CACHE
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];

View File

@ -22,8 +22,10 @@ void *AllocationCache::AcceleratorAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL )
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
// std::cout <<"AcceleratorAllocate: allocated Accelerator pointer "<<std::hex<<ptr<<std::endl;
}
return ptr;
}
@ -31,7 +33,7 @@ void AllocationCache::AcceleratorFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) acceleratorFreeShared(__freeme);
if ( __freeme ) acceleratorFreeDevice(__freeme);
}
void *AllocationCache::CpuAllocate(size_t bytes)
{
@ -39,9 +41,7 @@ void *AllocationCache::CpuAllocate(size_t bytes)
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
// std::cout <<"CpuAllocate: allocated pointer "<<std::hex<<ptr<<std::endl;
} else {
// std::cout <<"CpuAllocate: cached pointer "<<std::hex<<ptr<<std::endl;
// std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::endl;
}
return ptr;
@ -50,7 +50,7 @@ void AllocationCache::CpuFree (void *ptr,size_t bytes)
{
// Look up in ViewCache
int e=CpuViewLookup(ptr);
if(e>=0){ Evict(e); }
if(e>=0){ Discard(e); }
// If present remove entry and free accelerator too.
// Can we ever hit a free event with a view still in scope?
@ -90,13 +90,18 @@ void AllocationCache::Init(void)
Ncache[AccSmall]=Nc;
}
}
std::cout << "MemoryManager::Init() SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
}
void *AllocationCache::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
#else
return ptr;
#endif
}
void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
@ -136,9 +141,13 @@ void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entri
void *AllocationCache::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
#else
return NULL;
#endif
}
void *AllocationCache::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{

View File

@ -32,11 +32,38 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define ALLOCATION_CACHE
#undef ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum ViewAdvise {
AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
class AllocationCache {
private:
@ -70,19 +97,23 @@ private:
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static int ViewVictim(void);
static void CpuDiscard(int e);
static void Discard(int e);
static void Evict(int e);
static void Flush(int e);
static void Clone(int e);
static int CpuViewLookup(void *CpuPtr);
static int AccViewLookup(void *AccPtr);
// static int AccViewLookup(void *AccPtr);
static void AcceleratorViewClose(void* AccPtr);
static void *AcceleratorViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(void* Ptr);
static void *CpuViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
public:
static void Init(void);
static void AccViewClose(void* AccPtr);
static void CpuViewClose(void* CpuPtr);
static void *AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
static void *CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
static void ViewClose(void* AccPtr,ViewMode mode);
static void *ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);

View File

@ -1,9 +1,9 @@
#include <Grid/GridCore.h>
#ifndef GRID_UNIFIED
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define dprintf(...)
#define dprintf
////////////////////////////////////////////////////////////
// For caching copies of data on device
@ -20,15 +20,12 @@ typedef struct {
uint32_t cpuLock;
} AcceleratorViewEntry;
#define Write (1)
#define Read (2)
#define WriteDiscard (3)
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
static AcceleratorViewEntry AccCache[NaccCacheMax];
static int AccCacheVictim; // Base for round robin search
static int NaccCache = 8;
static int NaccCache = 32;
////////////////////////////////////
// Priority ordering for unlocked entries
@ -68,7 +65,7 @@ int AllocationCache::ViewVictim(void)
if ( locks==0 ) {
if( s==Empty ) { prioEmpty = e; dprintf("Empty");}
if( s==Empty ) { prioEmpty = e; dprintf("Empty"); }
if( t == EvictNext ) {
if( s==CpuDirty ) { prioCpuDirtyEN = e; dprintf("CpuDirty Transient");}
@ -97,21 +94,42 @@ int AllocationCache::ViewVictim(void)
if ( prioEmpty >= 0 ) victim = prioEmpty; /*Highest prio is winner*/
assert(victim >= 0); // Must succeed
dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim);
dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim);
// advance victim pointer
AccCacheVictim=(AccCacheVictim+1)%NaccCache;
dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache);
dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache);
return victim;
}
/////////////////////////////////////////////////
// Accelerator cache motion
/////////////////////////////////////////////////
void AllocationCache::Discard(int e) // remove from Accelerator, remove entry, without flush
{
if(AccCache[e].state!=Empty){
dprintf("AllocationCache: Discard(%d) %llx,%llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
assert(AccCache[e].accLock==0);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr) {
dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes);
}
}
AccCache[e].AccPtr=NULL;
AccCache[e].CpuPtr=NULL;
AccCache[e].bytes=0;
AccCache[e].state=Empty;
AccCache[e].accLock=0;
AccCache[e].cpuLock=0;
}
void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry
{
if(AccCache[e].state!=Empty){
dprintf("AllocationCache: Evict(%d) %llx,%llxn",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
dprintf("AllocationCache: Evict(%d) %llx,%llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
assert(AccCache[e].accLock==0);
assert(AccCache[e].cpuLock==0);
if(AccCache[e].state==AccDirty) {
@ -119,7 +137,7 @@ void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerat
}
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr) {
dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes);
}
}
@ -132,7 +150,7 @@ void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerat
}
void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. Do not remove
{
dprintf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
// printf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout);
assert(AccCache[e].state==AccDirty);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock==0);
@ -150,14 +168,50 @@ void AllocationCache::Clone(int e)// Copy from CPU, mark consistent. Allocate if
if(AccCache[e].AccPtr==NULL){
AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes);
}
dprintf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
// printf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout);
acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes);
AccCache[e].state=Consistent;
}
void AllocationCache::CpuDiscard(int e)// Mark accelerator dirty without copy. Allocate if necessary
{
assert(AccCache[e].state!=Empty);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock==0);
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr==NULL){
AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes);
}
// printf("AllocationCache: CpuDiscard(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout);
// acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes);
AccCache[e].state=AccDirty;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
void AllocationCache::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
AcceleratorViewClose(Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose(Ptr);
} else {
assert(0);
}
}
void *AllocationCache::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
return AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
return nullptr;
}
}
void *AllocationCache::AcceleratorViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
@ -165,9 +219,11 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
int e=CpuViewLookup(CpuPtr);
if(e==-1) {
e = ViewVictim();
dprintf("AcceleratorViewOpen Victim is %d\n",e);
Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache[e].cpuLock==0); // Programming error
if(AccCache[e].state!=Empty) {
@ -193,35 +249,50 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
AccCache[e].AccPtr = NULL;
AccCache[e].bytes = bytes;
AccCache[e].state = CpuDirty; // Cpu starts primary
Clone(e);
if(mode==Write)
AccCache[e].state = AccDirty; // Empty + AccWrite=> AccDirty
else
if(mode==AcceleratorWriteDiscard){
CpuDiscard(e);
AccCache[e].state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite){
Clone(e);
AccCache[e].state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(e);
AccCache[e].state = Consistent; // Empty + AccRead => Consistent
}
AccCache[e].accLock= 1;
} else if(AccCache[e].state&CpuDirty ){
Clone(e);
if(mode==Write)
AccCache[e].state = AccDirty; // CpuDirty + AccWrite=> AccDirty
else
// printf("Copied Empy entry %d into device accLock %d\n",e,AccCache[e].accLock);
} else if(AccCache[e].state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(e);
AccCache[e].state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(e);
AccCache[e].state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(e);
AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache[e].accLock++;
} else if(AccCache[e].state&Consistent) {
if(mode==Write)
AccCache[e].state = AccDirty; // Consistent + AccWrite=> AccDirty
// printf("Copied CpuDirty entry %d into device accLock %d\n",e,AccCache[e].accLock);
} else if(AccCache[e].state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache[e].state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache[e].state = Consistent; // Consistent + AccRead => Consistent
AccCache[e].accLock++;
} else if(AccCache[e].state&AccDirty) {
if(mode==Write)
AccCache[e].state = AccDirty; // AccDirty + AccWrite=> AccDirty
// printf("Consistent entry %d into device accLock %d\n",e,AccCache[e].accLock);
} else if(AccCache[e].state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache[e].state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache[e].state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache[e].accLock++;
// printf("AccDirty entry %d into device accLock %d\n",e,AccCache[e].accLock);
} else {
assert(0);
}
int transient =hint;
AccCache[e].transient= transient? EvictNext : 0;
return AccCache[e].AccPtr;
@ -241,12 +312,18 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void AllocationCache::AccViewClose(void* AccPtr)
void AllocationCache::AcceleratorViewClose(void* AccPtr)
{
int e=AccViewLookup(AccPtr);
int e=CpuViewLookup(AccPtr);
// printf("AccView close %d lock %d \n",e,AccCache[e].accLock);
if(e==-1) exit(0);
if(AccCache[e].cpuLock!=0) exit(0);
if(AccCache[e].accLock==0) exit(0);
/*
assert(e!=-1);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock>0);
*/
AccCache[e].accLock--;
}
void AllocationCache::CpuViewClose(void* CpuPtr)
@ -257,7 +334,7 @@ void AllocationCache::CpuViewClose(void* CpuPtr)
assert(AccCache[e].accLock==0);
AccCache[e].cpuLock--;
}
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
@ -265,9 +342,11 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
int e=CpuViewLookup(CpuPtr);
if(e==-1) {
e = ViewVictim();
dprintf("CpuViewOpen Victim is %d\n",e);
Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
}
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache[e].accLock==0); // Programming error
if(AccCache[e].state!=Empty) {
@ -288,7 +367,7 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
AccCache[e].cpuLock++;
} else if(AccCache[e].state==Consistent) {
assert(AccCache[e].AccPtr != NULL);
if(mode==Write)
if(mode==CpuWrite)
AccCache[e].state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache[e].state = Consistent; // Consistent +CpuRead => Consistent
@ -296,7 +375,7 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
} else if(AccCache[e].state==AccDirty) {
assert(AccCache[e].AccPtr != NULL);
Flush(e);
if(mode==Write) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
if(mode==CpuWrite) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache[e].state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache[e].cpuLock++;
} else {
@ -321,16 +400,6 @@ int AllocationCache::CpuViewLookup(void *CpuPtr)
}
return -1;
}
int AllocationCache::AccViewLookup(void *AccPtr)
{
assert(AccPtr!=NULL);
for(int e=0;e<NaccCache;e++){
if ( (AccCache[e].state!=Empty) && (AccCache[e].AccPtr==AccPtr) ) {
return e;
}
}
return -1;
}
NAMESPACE_END(Grid);

View File

@ -1,5 +1,5 @@
#include <Grid/GridCore.h>
#ifdef GRID_UNIFIED
#ifdef GRID_UVM
#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
@ -7,21 +7,22 @@ NAMESPACE_BEGIN(Grid);
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void AllocationCache::AccViewClose(void* AccPtr){}
void AllocationCache::CpuViewClose(void* CpuPtr){}
void AllocationCache::AcceleratorViewClose(void* AccPtr){};
void *AllocationCache::AcceleratorViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }
void AllocationCache::CpuViewClose(void* Ptr){};
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }
int AllocationCache::CpuViewLookup(void *CpuPtr){ return 0;}
/////////////////////////////////////
// Dummy stubs
/////////////////////////////////////
int AllocationCache::ViewVictim(void) { assert(0); return 0;}
void AllocationCache::Evict(int e) { assert(0);}
void AllocationCache::Flush(int e) { assert(0);}
void AllocationCache::Clone(int e) { assert(0);}
int AllocationCache::CpuViewLookup(void *CpuPtr){assert(0); return 0;}
int AllocationCache::AccViewLookup(void *AccPtr){assert(0); return 0;}
void AllocationCache::CpuDiscard(int e) { return;}
void AllocationCache::Discard(int e) { return;}
void AllocationCache::Evict(int e) { return; }
void AllocationCache::Flush(int e) { assert(0);}
void AllocationCache::Clone(int e) { assert(0);}
int AllocationCache::ViewVictim(void) { assert(0); return 0;}
void AllocationCache::ViewClose(void* AccPtr,ViewMode mode){};
void *AllocationCache::ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){return CpuPtr;};
NAMESPACE_END(Grid);
#endif