Mirror of https://github.com/paboyle/Grid.git

commit ebb60330c9
parent a9847aa866

    Automatic data motion options beginning
@@ -47,7 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
-#include <Grid/allocator/AlignedAllocator.h>
+#include <Grid/allocator/Allocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/threads/ThreadReduction.h>
 #include <Grid/serialisation/Serialisation.h>
@@ -26,102 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
-#ifndef GRID_ALIGNED_ALLOCATOR_H
-#define GRID_ALIGNED_ALLOCATOR_H
+#pragma once

 NAMESPACE_BEGIN(Grid);

-/*Move control to configure.ac and Config.h*/
-#define POINTER_CACHE
-/*Pinning pages is costly*/
-/*Could maintain separate large and small allocation caches*/
-#ifdef POINTER_CACHE
-class PointerCache {
-private:
-
-  static const int Ncache=128;
-  static int victim;
-
-  typedef struct {
-    void *address;
-    size_t bytes;
-    int valid;
-  } PointerCacheEntry;
-
-  static PointerCacheEntry Entries[Ncache];
-
-public:
-
-  static void *Insert(void *ptr,size_t bytes) ;
-  static void *Lookup(size_t bytes) ;
-
-};
-#endif
-
-std::string sizeString(size_t bytes);
-
-struct MemoryStats
-{
-  size_t totalAllocated{0}, maxAllocated{0},
-         currentlyAllocated{0}, totalFreed{0};
-};
-
-class MemoryProfiler
-{
-public:
-  static MemoryStats *stats;
-  static bool        debug;
-};
-
-#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
-#define profilerDebugPrint \
-  if (MemoryProfiler::stats) \
-  { \
-    auto s = MemoryProfiler::stats; \
-    std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
-    std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
-              << std::endl; \
-    std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
-              << std::endl; \
-    std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
-              << std::endl; \
-    std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
-              << std::endl; \
-  }
-
-#define profilerAllocate(bytes) \
-  if (MemoryProfiler::stats) \
-  { \
-    auto s = MemoryProfiler::stats; \
-    s->totalAllocated     += (bytes); \
-    s->currentlyAllocated += (bytes); \
-    s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated); \
-  } \
-  if (MemoryProfiler::debug) \
-  { \
-    std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
-    profilerDebugPrint; \
-  }
-
-#define profilerFree(bytes) \
-  if (MemoryProfiler::stats) \
-  { \
-    auto s = MemoryProfiler::stats; \
-    s->totalFreed         += (bytes); \
-    s->currentlyAllocated -= (bytes); \
-  } \
-  if (MemoryProfiler::debug) \
-  { \
-    std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
-    profilerDebugPrint; \
-  }
-
-void check_huge_pages(void *Buf,uint64_t BYTES);

 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////

 template<typename _Tp>
 class alignedAllocator {
 public:
@@ -144,42 +52,23 @@ public:
   pointer allocate(size_type __n, const void* _p= 0)
   {
     size_type bytes = __n*sizeof(_Tp);

     profilerAllocate(bytes);

-#ifdef POINTER_CACHE
-    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
-#else
-    pointer ptr = nullptr;
-#endif
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) acceleratorAllocShared(bytes);
+    _Tp *ptr = (_Tp*) AllocationCache::CpuAllocate(bytes);

     assert( ( (_Tp*)ptr != (_Tp *)NULL ) );

 #if 0
     size_type page_size=4096;
     size_type pages = (bytes+page_size-1)/page_size;
     uint8_t *bp = (uint8_t *)ptr;

     accelerator_for(pg,pages,1,{
       bp[pg*page_size]=0;
     });
 #endif
     return ptr;
   }

-  void deallocate(pointer __p, size_type __n) {
+  void deallocate(pointer __p, size_type __n)
+  {
     size_type bytes = __n * sizeof(_Tp);

     profilerFree(bytes);

-#ifdef POINTER_CACHE
-    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
-#else
-    pointer __freeme = __p;
-#endif
-    if ( __freeme ) acceleratorFreeShared((void *)__freeme);
+    AllocationCache::CpuFree((void *)__p,bytes);
   }

   // FIXME: hack for the copy constructor, eventually it must be avoided
@@ -201,4 +90,4 @@ template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<
 NAMESPACE_END(Grid);

 #endif
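Editor's sketch, not part of the commit: with the change above, any standard container parameterised on alignedAllocator recycles its buffers through the new cache; buf and the sizes are illustrative.

#include <vector>
using Grid::alignedAllocator;

std::vector<double,alignedAllocator<double>> buf(1024); // allocate() routes to AllocationCache::CpuAllocate (cache lookup first)
buf = decltype(buf)();                                  // deallocate() routes to AllocationCache::CpuFree; pointer parked for reuse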
Grid/allocator/AllocationCache.cc (new file, 159 lines)
@@ -0,0 +1,159 @@
#include <Grid/GridCore.h>

NAMESPACE_BEGIN(Grid);

/*Allocation types, saying which pointer cache should be used*/
#define Cpu      (0)
#define CpuSmall (1)
#define Acc      (2)
#define AccSmall (3)

//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pointer caches
//////////////////////////////////////////////////////////////////////
AllocationCache::AllocationCacheEntry AllocationCache::Entries[AllocationCache::NallocType][AllocationCache::NallocCacheMax];
int AllocationCache::Victim[AllocationCache::NallocType];
int AllocationCache::Ncache[AllocationCache::NallocType];

//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *AllocationCache::AcceleratorAllocate(size_t bytes)
{
  void *ptr = (void *) Lookup(bytes,Acc);

  if ( ptr == (void *) NULL )
    ptr = (void *) acceleratorAllocDevice(bytes);

  return ptr;
}
void AllocationCache::AcceleratorFree (void *ptr,size_t bytes)
{
  void *__freeme = Insert(ptr,bytes,Acc);

  if ( __freeme ) acceleratorFreeShared(__freeme);
}
void *AllocationCache::CpuAllocate(size_t bytes)
{
  void *ptr = (void *) Lookup(bytes,Cpu);

  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
    // std::cout <<"CpuAllocate: allocated pointer "<<std::hex<<ptr<<std::endl;
  } else {
    // std::cout <<"CpuAllocate: cached pointer "<<std::hex<<ptr<<std::endl;
  }

  return ptr;
}
void AllocationCache::CpuFree (void *ptr,size_t bytes)
{
  // Look up in ViewCache
  int e=CpuViewLookup(ptr);
  if(e>=0){ Evict(e); }

  // If present remove entry and free accelerator too.
  // Can we ever hit a free event with a view still in scope?
  void *__freeme = Insert(ptr,bytes,Cpu);
  // std::cout <<"CpuFree cached pointer "<<std::hex<<ptr<<std::endl;
  // std::cout <<"CpuFree deallocating pointer "<<std::hex<<__freeme<<std::endl;
  if ( __freeme ) acceleratorFreeShared(__freeme);
}
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void AllocationCache::Init(void)
{
  Ncache[Cpu]      = 8;
  Ncache[Acc]      = 8;
  Ncache[CpuSmall] = 32;
  Ncache[AccSmall] = 32;

  char * str;
  int Nc;
  int NcS;

  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
    Nc = atoi(str);
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[Cpu]=Nc;
      Ncache[Acc]=Nc;
    }
  }

  str= getenv("GRID_ALLOC_NCACHE_SMALL");
  if ( str ) {
    Nc = atoi(str);
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[CpuSmall]=Nc;
      Ncache[AccSmall]=Nc;
    }
  }
}
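Editor's note: the cache depths above are runtime-tunable. A hedged sketch of pinning them from a test harness before initialisation (assumes POSIX setenv; the values are illustrative):

#include <cstdlib>
#include <Grid/Grid.h>
int main(int argc,char **argv) {
  setenv("GRID_ALLOC_NCACHE_LARGE","16",1); // entries kept for allocations >= GRID_ALLOC_SMALL_LIMIT
  setenv("GRID_ALLOC_NCACHE_SMALL","64",1); // entries kept for smaller allocations
  Grid::Grid_init(&argc,&argv);             // Grid_init calls AllocationCache::Init(), which reads both variables once (see Init.cc below)
}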
void *AllocationCache::Insert(void *ptr,size_t bytes,int type)
{
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
}
void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
  assert(ncache>0);
#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
#endif

  void * ret = NULL;
  int v = -1;

  for(int e=0;e<ncache;e++) {
    if ( entries[e].valid==0 ) {
      v=e;
      break;
    }
  }

  if ( v==-1 ) {
    v=victim;
    victim = (victim+1)%ncache;
  }

  if ( entries[v].valid ) {
    ret = entries[v].address;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
  }

  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;

  return ret;
}

void *AllocationCache::Lookup(size_t bytes,int type)
{
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
  return Lookup(bytes,Entries[cache],Ncache[cache]);
}
void *AllocationCache::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{
  assert(ncache>0);
#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
#endif
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      return entries[e].address;
    }
  }
  return NULL;
}

NAMESPACE_END(Grid);
Grid/allocator/AllocationCache.h (new file, 93 lines)
@@ -0,0 +1,93 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/AllocationCache.h

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once

NAMESPACE_BEGIN(Grid);

// Move control to configure.ac and Config.h?

#define ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)

/*Pinning pages is costly*/

class AllocationCache {
private:

  ////////////////////////////////////////////////////////////
  // For caching recently freed allocations
  ////////////////////////////////////////////////////////////
  typedef struct {
    void *address;
    size_t bytes;
    int valid;
  } AllocationCacheEntry;

  static const int NallocCacheMax=128;
  static const int NallocType=4;
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];

  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes,int type) ;
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;

  /////////////////////////////////////////////////
  // Internal device view
  /////////////////////////////////////////////////
  static void *AcceleratorAllocate(size_t bytes);
  static void  AcceleratorFree    (void *ptr,size_t bytes);
  static int   ViewVictim(void);
  static void  Evict(int e);
  static void  Flush(int e);
  static void  Clone(int e);
  static int   CpuViewLookup(void *CpuPtr);
  static int   AccViewLookup(void *AccPtr);

public:
  static void Init(void);

  static void  AccViewClose(void* AccPtr);
  static void  CpuViewClose(void* CpuPtr);
  static void *AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
  static void *CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);

  static void *CpuAllocate(size_t bytes);
  static void  CpuFree    (void *ptr,size_t bytes);
};

NAMESPACE_END(Grid);
Grid/allocator/Allocator.h (new file, 4 lines)
@@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/AllocationCache.h>
#include <Grid/allocator/AlignedAllocator.h>
Grid/allocator/MemoryCacheDeviceMem.cc (new file, 338 lines)
@@ -0,0 +1,338 @@
#include <Grid/GridCore.h>
#ifndef GRID_UNIFIED

#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define dprintf(...)

////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
const int NaccCacheMax=128;

typedef struct {
  void *CpuPtr;
  void *AccPtr;
  size_t bytes;
  uint32_t transient;
  uint32_t state;
  uint32_t accLock;
  uint32_t cpuLock;
} AcceleratorViewEntry;

#define Write        (1)
#define Read         (2)
#define WriteDiscard (3)
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
static AcceleratorViewEntry AccCache[NaccCacheMax];
static int AccCacheVictim; // Base for round robin search
static int NaccCache = 8;

////////////////////////////////////
// Priority ordering for unlocked entries
//  Empty
//  CpuDirty
//  Consistent
//  AccDirty
////////////////////////////////////
#define Empty      (0x0) /*Entry unoccupied                                   */
#define CpuDirty   (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid                    */
#define AccDirty   (0x4) /*ACC copy is golden                                 */
#define EvictNext  (0x8) /*Priority for eviction*/

int AllocationCache::ViewVictim(void)
{
  int prioEmpty        =-1;
  int prioCpuDirty     =-1;
  int prioConsistent   =-1;
  int prioAccDirty     =-1;
  int prioCpuDirtyEN   =-1;
  int prioConsistentEN =-1;
  int prioAccDirtyEN   =-1;

  int victim=-1;

  // round robin priority search of unlocked entries offset from current victim
  for(int ep=0;ep<NaccCache;ep++){
    int e = (ep+AccCacheVictim)%NaccCache;
    dprintf("AllocationCacheDeviceMem: Inspecting cache entry %d :",e);

    uint32_t locks = AccCache[e].cpuLock+AccCache[e].accLock;
    uint32_t s = AccCache[e].state;
    uint32_t t = AccCache[e].transient;

    assert( (s==Empty)||(s==CpuDirty)||(s==AccDirty)||(s==Consistent));

    if ( locks==0 ) {

      if( s==Empty ) { prioEmpty = e; dprintf("Empty");}

      if( t == EvictNext ) {
        if( s==CpuDirty   ) { prioCpuDirtyEN   = e; dprintf("CpuDirty Transient");}
        if( s==Consistent ) { prioConsistentEN = e; dprintf("Consistent Transient");}
        if( s==AccDirty   ) { prioAccDirtyEN   = e; dprintf("AccDirty Transient");}
      } else {
        if( s==CpuDirty   ) { prioCpuDirty   = e; dprintf("CpuDirty");}
        if( s==Consistent ) { prioConsistent = e; dprintf("Consistent");}
        if( s==AccDirty   ) { prioAccDirty   = e; dprintf("AccDirty");}
      }

    } else {
      if ( AccCache[e].cpuLock ) dprintf("Locked in Cpu ");
      if ( AccCache[e].accLock ) dprintf("Locked in Acc ");
    }
    dprintf("\n");
  }
  // This encodes the prioritisation for device residency
  // EvictNext provides a transient mechanism
  if ( prioAccDirty     >= 0 ) victim = prioAccDirty;
  if ( prioConsistent   >= 0 ) victim = prioConsistent;
  if ( prioCpuDirty     >= 0 ) victim = prioCpuDirty;
  if ( prioAccDirtyEN   >= 0 ) victim = prioAccDirtyEN;
  if ( prioConsistentEN >= 0 ) victim = prioConsistentEN;
  if ( prioCpuDirtyEN   >= 0 ) victim = prioCpuDirtyEN;
  if ( prioEmpty        >= 0 ) victim = prioEmpty; /*Highest prio is winner*/

  assert(victim >= 0); // Must succeed
  dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim);

  // advance victim pointer
  AccCacheVictim=(AccCacheVictim+1)%NaccCache;
  dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache);

  return victim;
}
/////////////////////////////////////////////////
// Accelerator cache motion
/////////////////////////////////////////////////
void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry
{
  if(AccCache[e].state!=Empty){
    dprintf("AllocationCache: Evict(%d) %llx,%llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
    assert(AccCache[e].accLock==0);
    assert(AccCache[e].cpuLock==0);
    if(AccCache[e].state==AccDirty) {
      Flush(e);
    }
    assert(AccCache[e].CpuPtr!=NULL);
    if(AccCache[e].AccPtr) {
      dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
      AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes);
    }
  }
  AccCache[e].AccPtr=NULL;
  AccCache[e].CpuPtr=NULL;
  AccCache[e].bytes=0;
  AccCache[e].state=Empty;
  AccCache[e].accLock=0;
  AccCache[e].cpuLock=0;
}
void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. Do not remove
{
  dprintf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
  assert(AccCache[e].state==AccDirty);
  assert(AccCache[e].cpuLock==0);
  assert(AccCache[e].accLock==0);
  assert(AccCache[e].AccPtr!=NULL);
  assert(AccCache[e].CpuPtr!=NULL);
  acceleratorCopyFromDevice(AccCache[e].AccPtr,AccCache[e].CpuPtr,AccCache[e].bytes);
  AccCache[e].state=Consistent;
}
void AllocationCache::Clone(int e)// Copy from CPU, mark consistent. Allocate if necessary
{
  assert(AccCache[e].state==CpuDirty);
  assert(AccCache[e].cpuLock==0);
  assert(AccCache[e].accLock==0);
  assert(AccCache[e].CpuPtr!=NULL);
  if(AccCache[e].AccPtr==NULL){
    AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes);
  }
  dprintf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
  acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes);
  AccCache[e].state=Consistent;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
{
  ////////////////////////////////////////////////////////////////////////////
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  int e=CpuViewLookup(CpuPtr);
  if(e==-1) {
    e = ViewVictim();
    Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
  }

  assert(AccCache[e].cpuLock==0);  // Programming error

  if(AccCache[e].state!=Empty) {
    assert(AccCache[e].CpuPtr == CpuPtr);
    assert(AccCache[e].bytes==bytes);
  }
  /*
   * State transitions and actions
   *
   * Action   State      StateNext   Flush   Clone
   *
   * AccRead  Empty      Consistent    -       Y
   * AccWrite Empty      AccDirty      -       Y
   * AccRead  CpuDirty   Consistent    -       Y
   * AccWrite CpuDirty   AccDirty      -       Y
   * AccRead  Consistent Consistent    -       -
   * AccWrite Consistent AccDirty      -       -
   * AccRead  AccDirty   AccDirty      -       -
   * AccWrite AccDirty   AccDirty      -       -
   */
  if(AccCache[e].state==Empty) {
    AccCache[e].CpuPtr = CpuPtr;
    AccCache[e].AccPtr = NULL;
    AccCache[e].bytes  = bytes;
    AccCache[e].state  = CpuDirty;    // Cpu starts primary
    Clone(e);
    if(mode==Write)
      AccCache[e].state = AccDirty;   // Empty + AccWrite=> AccDirty
    else
      AccCache[e].state = Consistent; // Empty + AccRead => Consistent
    AccCache[e].accLock= 1;
  } else if(AccCache[e].state&CpuDirty ){
    Clone(e);
    if(mode==Write)
      AccCache[e].state = AccDirty;   // CpuDirty + AccWrite=> AccDirty
    else
      AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent
    AccCache[e].accLock++;
  } else if(AccCache[e].state&Consistent) {
    if(mode==Write)
      AccCache[e].state = AccDirty;   // Consistent + AccWrite=> AccDirty
    else
      AccCache[e].state = Consistent; // Consistent + AccRead => Consistent
    AccCache[e].accLock++;
  } else if(AccCache[e].state&AccDirty) {
    if(mode==Write)
      AccCache[e].state = AccDirty;   // AccDirty + AccWrite=> AccDirty
    else
      AccCache[e].state = AccDirty;   // AccDirty + AccRead => AccDirty
    AccCache[e].accLock++;
  } else {
    assert(0);
  }

  AccCache[e].transient= transient? EvictNext : 0;

  return AccCache[e].AccPtr;
}
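Editor's sketch of the calling pattern for the state machine above, not part of the commit; host-side names and sizes are illustrative.

size_t bytes = 1024*sizeof(double);
double *host_ptr = (double *)AllocationCache::CpuAllocate(bytes);
void *acc = AllocationCache::AccViewOpen(host_ptr,bytes,Write,0); // Empty -> Clone -> AccDirty, accLock=1
// ... accelerator kernel writes through acc ...
AllocationCache::AccViewClose(acc);                               // accLock--; the dirty copy is flushed lazily on CpuViewOpen or Evict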
/*
 * Action    State      StateNext  Flush  Clone
 *
 * CpuRead   Empty      CpuDirty     -      -
 * CpuWrite  Empty      CpuDirty     -      -
 * CpuRead   CpuDirty   CpuDirty     -      -
 * CpuWrite  CpuDirty   CpuDirty     -      -
 * CpuRead   Consistent Consistent   -      -
 * CpuWrite  Consistent CpuDirty     -      -
 * CpuRead   AccDirty   Consistent   Y      -
 * CpuWrite  AccDirty   CpuDirty     Y      -
 */
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void AllocationCache::AccViewClose(void* AccPtr)
{
  int e=AccViewLookup(AccPtr);
  assert(e!=-1);
  assert(AccCache[e].cpuLock==0);
  assert(AccCache[e].accLock>0);
  AccCache[e].accLock--;
}
void AllocationCache::CpuViewClose(void* CpuPtr)
{
  int e=CpuViewLookup(CpuPtr);
  assert(e!=-1);
  assert(AccCache[e].cpuLock>0);
  assert(AccCache[e].accLock==0);
  AccCache[e].cpuLock--;
}
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
{
  ////////////////////////////////////////////////////////////////////////////
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  int e=CpuViewLookup(CpuPtr);
  if(e==-1) {
    e = ViewVictim();
    Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
  }

  assert(AccCache[e].accLock==0);  // Programming error

  if(AccCache[e].state!=Empty) {
    assert(AccCache[e].CpuPtr == CpuPtr);
    assert(AccCache[e].bytes==bytes);
  }

  if(AccCache[e].state==Empty) {
    AccCache[e].CpuPtr = CpuPtr;
    AccCache[e].AccPtr = NULL;
    AccCache[e].bytes  = bytes;
    AccCache[e].state  = CpuDirty;    // Empty + CpuRead/CpuWrite => CpuDirty
    AccCache[e].accLock= 0;
    AccCache[e].cpuLock= 1;
  } else if(AccCache[e].state==CpuDirty ){
    // AccPtr don't care, deferred allocate
    AccCache[e].state = CpuDirty;     // CpuDirty +CpuRead/CpuWrite => CpuDirty
    AccCache[e].cpuLock++;
  } else if(AccCache[e].state==Consistent) {
    assert(AccCache[e].AccPtr != NULL);
    if(mode==Write)
      AccCache[e].state = CpuDirty;   // Consistent +CpuWrite => CpuDirty
    else
      AccCache[e].state = Consistent; // Consistent +CpuRead  => Consistent
    AccCache[e].cpuLock++;
  } else if(AccCache[e].state==AccDirty) {
    assert(AccCache[e].AccPtr != NULL);
    Flush(e);
    if(mode==Write) AccCache[e].state = CpuDirty;   // AccDirty +CpuWrite => CpuDirty, Flush
    else            AccCache[e].state = Consistent; // AccDirty +CpuRead  => Consistent, Flush
    AccCache[e].cpuLock++;
  } else {
    assert(0); // should be unreachable
  }

  AccCache[e].transient= transient? EvictNext : 0;

  return AccCache[e].CpuPtr;
}

//////////////////////////////////////////////////////////////////////////////
// Linear scan over entries matching the CPU / accelerator pointer
//////////////////////////////////////////////////////////////////////////////
int AllocationCache::CpuViewLookup(void *CpuPtr)
{
  assert(CpuPtr!=NULL);
  for(int e=0;e<NaccCache;e++){
    if ( (AccCache[e].state!=Empty) && (AccCache[e].CpuPtr==CpuPtr) ) {
      return e;
    }
  }
  return -1;
}
int AllocationCache::AccViewLookup(void *AccPtr)
{
  assert(AccPtr!=NULL);
  for(int e=0;e<NaccCache;e++){
    if ( (AccCache[e].state!=Empty) && (AccCache[e].AccPtr==AccPtr) ) {
      return e;
    }
  }
  return -1;
}

NAMESPACE_END(Grid);

#endif
Grid/allocator/MemoryCacheShared.cc (new file, 27 lines)
@@ -0,0 +1,27 @@
#include <Grid/GridCore.h>
#ifdef GRID_UNIFIED

#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////

void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void  AllocationCache::AccViewClose(void* AccPtr){}
void  AllocationCache::CpuViewClose(void* CpuPtr){}

/////////////////////////////////////
// Dummy stubs
/////////////////////////////////////
int  AllocationCache::ViewVictim(void) { assert(0); return 0;}
void AllocationCache::Evict(int e) { assert(0);}
void AllocationCache::Flush(int e) { assert(0);}
void AllocationCache::Clone(int e) { assert(0);}

int AllocationCache::CpuViewLookup(void *CpuPtr){assert(0); return 0;}
int AllocationCache::AccViewLookup(void *AccPtr){assert(0); return 0;}

NAMESPACE_END(Grid);
#endif
Grid/allocator/MemoryStats.cc (new file, 67 lines)
@@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>

NAMESPACE_BEGIN(Grid);

MemoryStats *MemoryProfiler::stats = nullptr;
bool         MemoryProfiler::debug = false;

void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  uint64_t pagedata[npages];
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      if (pageaddr != baseaddr + j * page_size)
        ++nnothuge;
    }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}

std::string sizeString(const size_t bytes)
{
  constexpr unsigned int bufSize = 256;
  const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
  char buf[256];
  size_t s = 0;
  double count = bytes;

  while (count >= 1024 && s < 7)
  {
    s++;
    count /= 1024;
  }
  if (count - floor(count) == 0.0)
  {
    snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
    snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
  }

  return std::string(buf);
}
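Editor's note: expected behaviour of sizeString as written above, for reference.

// sizeString(512)    -> "512 B"
// sizeString(1024)   -> "1 KB"
// sizeString(1536)   -> "1.5 KB"
// sizeString(3u<<20) -> "3 MB"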

NAMESPACE_END(Grid);
Grid/allocator/MemoryStats.h (new file, 95 lines)
@@ -0,0 +1,95 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/MemoryStats.h

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once

NAMESPACE_BEGIN(Grid);

std::string sizeString(size_t bytes);

struct MemoryStats
{
  size_t totalAllocated{0}, maxAllocated{0},
         currentlyAllocated{0}, totalFreed{0};
};

class MemoryProfiler
{
public:
  static MemoryStats *stats;
  static bool        debug;
};

#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
  if (MemoryProfiler::stats) \
  { \
    auto s = MemoryProfiler::stats; \
    std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
    std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
              << std::endl; \
    std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
              << std::endl; \
    std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
              << std::endl; \
    std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
              << std::endl; \
  }

#define profilerAllocate(bytes) \
  if (MemoryProfiler::stats) \
  { \
    auto s = MemoryProfiler::stats; \
    s->totalAllocated     += (bytes); \
    s->currentlyAllocated += (bytes); \
    s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated); \
  } \
  if (MemoryProfiler::debug) \
  { \
    std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
    profilerDebugPrint; \
  }

#define profilerFree(bytes) \
  if (MemoryProfiler::stats) \
  { \
    auto s = MemoryProfiler::stats; \
    s->totalFreed         += (bytes); \
    s->currentlyAllocated -= (bytes); \
  } \
  if (MemoryProfiler::debug) \
  { \
    std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
    profilerDebugPrint; \
  }
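Editor's sketch: the hooks above are off by default and are enabled from user code.

MemoryProfiler::stats = new MemoryStats; // start accumulating the four counters
MemoryProfiler::debug = true;            // log every profilerAllocate / profilerFree
// ... run the workload ...
std::cout << memString(MemoryProfiler::stats->maxAllocated) << std::endl; // peak footprint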

void check_huge_pages(void *Buf,uint64_t BYTES);

NAMESPACE_END(Grid);
@@ -87,7 +87,7 @@ sobj eval(const uint64_t ss, const sobj &arg)
 }

 template <class lobj> accelerator_inline
-const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
+const lobj & eval(const uint64_t ss, const LatticeExprView<lobj> &arg)
 {
   return arg[ss];
 }
@@ -179,16 +179,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
   cb = lat.Checkerboard();
 }
 template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
-inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
-{
-}
-
+inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
 template <typename Op, typename T1> inline
 void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
 {
   CBFromExpression(cb, expr.arg1); // recurse AST
 }

 template <typename Op, typename T1, typename T2> inline
 void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
 {
@@ -203,6 +199,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
   CBFromExpression(cb, expr.arg3); // recurse AST
 }

+//////////////////////////////////////////////////////////////////////////
+// ViewOpen
+//////////////////////////////////////////////////////////////////////////
+template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
+{
+  lat.AcceleratorViewOpen();
+}
+template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void ExpressionViewOpen(T1 &notlat) {}
+
+template <typename Op, typename T1> inline
+void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
+{
+  ExpressionViewOpen(expr.arg1); // recurse AST
+}
+
+template <typename Op, typename T1, typename T2> inline
+void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
+{
+  ExpressionViewOpen(expr.arg1); // recurse AST
+  ExpressionViewOpen(expr.arg2); // recurse AST
+}
+template <typename Op, typename T1, typename T2, typename T3>
+inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+{
+  ExpressionViewOpen(expr.arg1); // recurse AST
+  ExpressionViewOpen(expr.arg2); // recurse AST
+  ExpressionViewOpen(expr.arg3); // recurse AST
+}
+
+//////////////////////////////////////////////////////////////////////////
+// ViewClose
+//////////////////////////////////////////////////////////////////////////
+template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void ExpressionViewClose( T1 &lat) // Lattice leaf
+{
+  lat.AcceleratorViewClose();
+}
+template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void ExpressionViewClose(T1 &notlat) {}
+
+template <typename Op, typename T1> inline
+void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
+{
+  ExpressionViewClose(expr.arg1); // recurse AST
+}
+template <typename Op, typename T1, typename T2> inline
+void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
+{
+  ExpressionViewClose(expr.arg1); // recurse AST
+  ExpressionViewClose(expr.arg2); // recurse AST
+}
+template <typename Op, typename T1, typename T2, typename T3>
+inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+{
+  ExpressionViewClose(expr.arg1); // recurse AST
+  ExpressionViewClose(expr.arg2); // recurse AST
+  ExpressionViewClose(expr.arg3); // recurse AST
+}
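Editor's sketch of how the assignment operators further down drive this machinery; LatticeComplex and grid are illustrative names, not from this diff.

LatticeComplex a(&grid), b(&grid), c(&grid);
c = a + b; // operator= copies the AST (exprCopy), calls ExpressionViewOpen(exprCopy),
           // evaluates site-by-site in accelerator_for, then ExpressionViewClose(exprCopy)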

////////////////////////////////////////////
// Unary operators and funcs
////////////////////////////////////////////
@@ -83,11 +83,9 @@ public:
 // The copy constructor for this will need to be used by device lambda functions
 /////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj>
-class LatticeView : public LatticeAccelerator<vobj>
+class LatticeExprView : public LatticeAccelerator<vobj>
 {
 public:

 // Rvalue
 #ifdef GRID_SIMT
   accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
@@ -102,11 +100,65 @@ public:
   accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
   accelerator_inline uint64_t size(void)  const { return this->_odata_size; };

-  LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
+  // Non accelerator functions
+  LatticeExprView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
+  ~LatticeExprView(){}
+
+  void AcceleratorViewOpen(void)
+  { // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
+    void *cpu_ptr=this->_odata;
+    // std::cout << "AccViewOpen "<<std::hex<<this->_odata <<std::dec<<std::endl;
+    this->_odata=(vobj *)AllocationCache::AccViewOpen(this->_odata,this->_odata_size*sizeof(vobj),1,0);
+  }
+  void AcceleratorViewClose(void)
+  { // Inform the manager
+    // std::cout << "View Close"<<std::hex<<this->_odata<<std::dec <<std::endl;
+    AllocationCache::AccViewClose((void *)this->_odata);
+  }
+  void CpuViewOpen(void)
+  { // Translate the pointer
+    void *cpu_ptr=this->_odata;
+    // std::cout << "CpuViewOpen "<<std::hex<<this->_odata <<std::dec<<std::endl;
+    this->_odata=(vobj *)AllocationCache::CpuViewOpen(cpu_ptr,this->_odata_size*sizeof(vobj),1,0);
+  }
+  void CpuViewClose(void)
+  { // Inform the manager
+    // std::cout << "CpuViewClose"<<std::hex<<this->_odata<<std::dec <<std::endl;
+    AllocationCache::CpuViewClose((void *)this->_odata);
+  }
+
+};
+// UserView constructor,destructor updates view manager
+// Non-copyable object??? Second base with copy/= deleted?
+template<class vobj>
+class LatticeView : public LatticeExprView<vobj>
+{
+public:
+  // Rvalue
+  /*
+#ifdef GRID_SIMT
+  accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
+#else
+  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
+#endif
+
+  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
+  accelerator_inline vobj & operator[](size_t i)             { return this->_odata[i]; };
+
+  accelerator_inline uint64_t begin(void) const { return 0;};
+  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
+  accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
+  */
+  LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeExprView<vobj> (refer_to_me)
+  {
+    this->AcceleratorViewOpen();
+  }
+  ~LatticeView(){
+    this->AcceleratorViewClose();
+  }
+};

/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
@@ -120,7 +172,7 @@ template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;

 template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
-template<class T> struct ViewMapBase<T,true>  { typedef LatticeView<typename T::vector_object> Type; };
+template<class T> struct ViewMapBase<T,true>  { typedef LatticeExprView<typename T::vector_object> Type; };
 template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;

 template <typename Op, typename _T1>
@@ -231,12 +283,15 @@ public:
     CBFromExpression(cb,expr);
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;

+    auto exprCopy = expr;
+    ExpressionViewOpen(exprCopy);
     auto me  = View();
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,expr);
+      auto tmp = eval(ss,exprCopy);
       vstream(me[ss],tmp);
     });
+    ExpressionViewClose(exprCopy);
     return *this;
   }
   template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@@ -251,11 +306,14 @@ public:
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;

+    auto exprCopy = expr;
+    ExpressionViewOpen(exprCopy);
     auto me  = View();
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,expr);
+      auto tmp = eval(ss,exprCopy);
       vstream(me[ss],tmp);
     });
+    ExpressionViewClose(exprCopy);
     return *this;
   }
   template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@@ -269,11 +327,14 @@ public:
     CBFromExpression(cb,expr);
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
+    auto exprCopy = expr;
+    ExpressionViewOpen(exprCopy);
     auto me  = View();
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,expr);
+      auto tmp = eval(ss,exprCopy);
       vstream(me[ss],tmp);
     });
+    ExpressionViewClose(exprCopy);
     return *this;
   }
   //GridFromExpression is tricky to do
@@ -27,6 +27,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 *************************************************************************************/
 /* END LEGAL */
 #pragma once

+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#ifdef HAVE_MM_MALLOC_H
+#include <mm_malloc.h>
+#endif
+
 NAMESPACE_BEGIN(Grid);

 //////////////////////////////////////////////////////////////////////////////////
@@ -144,8 +155,8 @@ inline void *acceleratorAllocDevice(size_t bytes)
 };
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
-
-
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 #endif

 //////////////////////////////////////////////
@@ -192,6 +203,8 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}

 #endif
@@ -275,6 +288,8 @@ inline void *acceleratorAllocDevice(size_t bytes)

 inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}

 #endif
@@ -311,16 +326,8 @@ inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });

 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

-#ifdef HAVE_MALLOC_MALLOC_H
-#include <malloc/malloc.h>
-#endif
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-#ifdef HAVE_MM_MALLOC_H
-#include <mm_malloc.h>
-#endif
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}

 #ifdef HAVE_MM_MALLOC_H
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
@@ -286,6 +286,8 @@ void Grid_init(int *argc,char ***argv)
   //////////////////////////////////////////////////////////
   acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver

+  AllocationCache::Init();
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
     int MB;
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
configure.ac (17 lines changed)
@@ -147,7 +147,7 @@ case ${ac_SUMMIT} in
     AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
 esac

-############### SYCL
+############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
     [AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])],
     [ac_ACCELERATOR=${enable_accelerator}], [ac_ACCELERATOR=none])
@@ -168,6 +168,20 @@ case ${ac_ACCELERATOR} in
     AC_MSG_ERROR(["Acceleration not supported ${ac_ACCELERATOR}"]);;
 esac

+############### UNIFIED MEMORY
+AC_ARG_ENABLE([unified],
+    [AC_HELP_STRING([--enable-unified=yes|no], [enable unified address space for accelerator loops])],
+    [ac_UNIFIED=${enable_unified}], [ac_UNIFIED=yes])
+case ${ac_UNIFIED} in
+   yes)
+     echo Unified memory for accelerator loops
+     AC_DEFINE([GRID_UVM],[1],[Use unified address space]);;
+   no)
+     echo Manual memory copy for accelerator loops;;
+   *)
+     AC_MSG_ERROR(["Unified virtual memory option not supported ${ac_UNIFIED}"]);;
+esac
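Editor's note: with these options a non-unified CUDA build would be requested along the lines of "./configure --enable-accelerator=cuda --enable-unified=no", selecting manual host/device copies instead of a unified address space.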

 ############### Intel libraries
 AC_ARG_ENABLE([mkl],
     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@@ -612,6 +626,7 @@ compiler version                : ${ax_cv_gxx_version}
 SIMD                            : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading                       : ${ac_openmp}
 Acceleration                    : ${ac_ACCELERATOR}
+Unified virtual memory          : ${ac_UNIFIED}
 Communications type             : ${comms_type}
 Shared memory allocator         : ${ac_SHM}
 Shared memory mmap path         : ${ac_SHMPATH}