mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 23:37:06 +01:00

Automatic data motion options beginning

Peter Boyle
2020-05-17 16:34:25 -04:00
parent a9847aa866
commit ebb60330c9
14 changed files with 963 additions and 148 deletions


@@ -26,102 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#pragma once
NAMESPACE_BEGIN(Grid);
/*Move control to configure.ac and Config.h*/
#define POINTER_CACHE
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
#ifdef POINTER_CACHE
class PointerCache {
private:
static const int Ncache=128;
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
#endif
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
@@ -144,42 +52,23 @@ public:
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) acceleratorAllocShared(bytes);
_Tp *ptr = (_Tp*) AllocationCache::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
#if 0
size_type page_size=4096;
size_type pages = (bytes+page_size-1)/page_size;
uint8_t *bp = (uint8_t *)ptr;
accelerator_for(pg,pages,1,{
bp[pg*page_size]=0;
});
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
if ( __freeme ) acceleratorFreeShared((void *)__freeme);
AllocationCache::CpuFree((void *)__p,bytes);
}
// FIXME: hack for the copy constructor, eventually it must be avoided
@@ -201,4 +90,4 @@ template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<
NAMESPACE_END(Grid);
#endif
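// Illustrative sketch (hypothetical usage, not part of this diff): containers
// pick up the cached allocator purely through the allocator template argument,
// so, assuming the Vector alias this header defines alongside Matrix,
//
//   Vector<double> v(1<<20);        // allocate() -> AllocationCache::CpuAllocate
//   v.resize(0); v.shrink_to_fit(); // deallocate() -> AllocationCache::CpuFree
//
// no call sites need to change to gain the pooling behaviour.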


@@ -0,0 +1,159 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pointer caches
//////////////////////////////////////////////////////////////////////
AllocationCache::AllocationCacheEntry AllocationCache::Entries[AllocationCache::NallocType][AllocationCache::NallocCacheMax];
int AllocationCache::Victim[AllocationCache::NallocType];
int AllocationCache::Ncache[AllocationCache::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *AllocationCache::AcceleratorAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL )
ptr = (void *) acceleratorAllocDevice(bytes);
return ptr;
}
void AllocationCache::AcceleratorFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) acceleratorFreeDevice(__freeme); // pair with acceleratorAllocDevice in AcceleratorAllocate
}
void *AllocationCache::CpuAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
// std::cout <<"CpuAllocate: allocated pointer "<<std::hex<<ptr<<std::endl;
} else {
// std::cout <<"CpuAllocate: cached pointer "<<std::hex<<ptr<<std::endl;
}
return ptr;
}
void AllocationCache::CpuFree (void *ptr,size_t bytes)
{
// Look up in ViewCache
int e=CpuViewLookup(ptr);
if(e>=0){ Evict(e); }
// If present remove entry and free accelerator too.
// Can we ever hit a free event with a view still in scope?
void *__freeme = Insert(ptr,bytes,Cpu);
// std::cout <<"CpuFree cached pointer "<<std::hex<<ptr<<std::endl;
// std::cout <<"CpuFree deallocating pointer "<<std::hex<<__freeme<<std::endl;
if ( __freeme ) acceleratorFreeShared(__freeme);
}
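// Illustrative sketch (hypothetical caller, not from this commit): CpuFree
// parks the pointer in the Cpu pool rather than releasing it, so the next
// request of the same size is typically recycled:
//
//   void *a = AllocationCache::CpuAllocate(1<<20);
//   AllocationCache::CpuFree(a,1<<20);             // cached, not returned to the OS
//   void *b = AllocationCache::CpuAllocate(1<<20); // typically b == a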
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void AllocationCache::Init(void)
{
Ncache[Cpu] = 8;
Ncache[Acc] = 8;
Ncache[CpuSmall] = 32;
Ncache[AccSmall] = 32;
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
}
}
}
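// Illustrative sketch (hypothetical driver, not from this commit): the pool
// depths can be overridden before Init() runs; setenv here is plain POSIX:
//
//   setenv("GRID_ALLOC_NCACHE_LARGE","16",1); // depth of the Cpu and Acc pools
//   setenv("GRID_ALLOC_NCACHE_SMALL","64",1); // pools for blocks < GRID_ALLOC_SMALL_LIMIT
//   AllocationCache::Init();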
void *AllocationCache::Insert(void *ptr,size_t bytes,int type)
{
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small; // relies on CpuSmall==Cpu+1 and AccSmall==Acc+1
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
}
void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *AllocationCache::Lookup(size_t bytes,int type)
{
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
}
void *AllocationCache::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { // recycle exact size matches only
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
NAMESPACE_END(Grid);


@@ -0,0 +1,93 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/AllocationCache.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024) /* 2 MiB: the x86 huge page size */
#define GRID_ALLOC_SMALL_LIMIT (4096)  /* below this, the "small" pools are used */
/*Pinning pages is costly*/
class AllocationCache {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=4;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,int type) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
/////////////////////////////////////////////////
// Internal device view
/////////////////////////////////////////////////
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static int ViewVictim(void);
static void Evict(int e);
static void Flush(int e);
static void Clone(int e);
static int CpuViewLookup(void *CpuPtr);
static int AccViewLookup(void *AccPtr);
public:
static void Init(void);
static void AccViewClose(void* AccPtr);
static void CpuViewClose(void* CpuPtr);
static void *AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
static void *CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
};
NAMESPACE_END(Grid);


@@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/AllocationCache.h>
#include <Grid/allocator/AlignedAllocator.h>


@@ -0,0 +1,338 @@
#include <Grid/GridCore.h>
#ifndef GRID_UNIFIED
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
const int NaccCacheMax=128;
typedef struct {
void *CpuPtr;
void *AccPtr;
size_t bytes;
uint32_t transient;
uint32_t state;
uint32_t accLock;
uint32_t cpuLock;
} AcceleratorViewEntry;
#define Write (1)
#define Read (2)
#define WriteDiscard (3)
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
static AcceleratorViewEntry AccCache[NaccCacheMax];
static int AccCacheVictim; // Base for round robin search
static int NaccCache = 8;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
int AllocationCache::ViewVictim(void)
{
int prioEmpty =-1;
int prioCpuDirty =-1;
int prioConsistent =-1;
int prioAccDirty =-1;
int prioCpuDirtyEN =-1;
int prioConsistentEN =-1;
int prioAccDirtyEN =-1;
int victim=-1;
// round robin priority search of unlocked entries offset from current victim
for(int ep=0;ep<NaccCache;ep++){
int e = (ep+AccCacheVictim)%NaccCache;
dprintf("AllocationCacheDeviceMem: Inspecting cache entry %d :",e);
uint32_t locks = AccCache[e].cpuLock+AccCache[e].accLock;
uint32_t s = AccCache[e].state;
uint32_t t = AccCache[e].transient;
assert( (s==Empty)||(s==CpuDirty)||(s==AccDirty)||(s==Consistent));
if ( locks==0 ) {
if( s==Empty ) { prioEmpty = e; dprintf("Empty");}
if( t == EvictNext ) {
if( s==CpuDirty ) { prioCpuDirtyEN = e; dprintf("CpuDirty Transient");}
if( s==Consistent ) { prioConsistentEN = e; dprintf("Consistent Transient");}
if( s==AccDirty ) { prioAccDirtyEN = e; dprintf("AccDirty Transient");}
} else {
if( s==CpuDirty ) { prioCpuDirty = e; dprintf("CpuDirty");}
if( s==Consistent ) { prioConsistent = e; dprintf("Consistent");}
if( s==AccDirty ) { prioAccDirty = e; dprintf("AccDirty");}
}
} else {
if ( AccCache[e].cpuLock ) dprintf("Locked in Cpu ");
if ( AccCache[e].accLock ) dprintf("Locked in Acc ");
}
dprintf("\n");
}
// This encodes the prioritisation for device residency
// EvictNext provides a transient mechanism
if ( prioAccDirty >= 0 ) victim = prioAccDirty;
if ( prioConsistent >= 0 ) victim = prioConsistent;
if ( prioCpuDirty >= 0 ) victim = prioCpuDirty;
if ( prioAccDirtyEN >= 0 ) victim = prioAccDirtyEN;
if ( prioConsistentEN >= 0 ) victim = prioConsistentEN;
if ( prioCpuDirtyEN >= 0 ) victim = prioCpuDirtyEN;
if ( prioEmpty >= 0 ) victim = prioEmpty; /*Highest prio is winner*/
assert(victim >= 0); // Must succeed
dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim);
// advance victim pointer
AccCacheVictim=(AccCacheVictim+1)%NaccCache;
dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache);
return victim;
}
/////////////////////////////////////////////////
// Accelerator cache motion
/////////////////////////////////////////////////
void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry
{
if(AccCache[e].state!=Empty){
dprintf("AllocationCache: Evict(%d) %llx,%llxn",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
assert(AccCache[e].accLock==0);
assert(AccCache[e].cpuLock==0);
if(AccCache[e].state==AccDirty) {
Flush(e);
}
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr) {
dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes);
}
}
AccCache[e].AccPtr=NULL;
AccCache[e].CpuPtr=NULL;
AccCache[e].bytes=0;
AccCache[e].state=Empty;
AccCache[e].accLock=0;
AccCache[e].cpuLock=0;
}
void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. Do not remove
{
dprintf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
assert(AccCache[e].state==AccDirty);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock==0);
assert(AccCache[e].AccPtr!=NULL);
assert(AccCache[e].CpuPtr!=NULL);
acceleratorCopyFromDevice(AccCache[e].AccPtr,AccCache[e].CpuPtr,AccCache[e].bytes);
AccCache[e].state=Consistent;
}
void AllocationCache::Clone(int e)// Copy from CPU, mark consistent. Allocate if necessary
{
assert(AccCache[e].state==CpuDirty);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock==0);
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr==NULL){
AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes);
}
dprintf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes);
AccCache[e].state=Consistent;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
int e=CpuViewLookup(CpuPtr);
if(e==-1) {
e = ViewVictim();
Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
}
assert(AccCache[e].cpuLock==0); // Programming error
if(AccCache[e].state!=Empty) {
assert(AccCache[e].CpuPtr == CpuPtr);
assert(AccCache[e].bytes==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
if(AccCache[e].state==Empty) {
AccCache[e].CpuPtr = CpuPtr;
AccCache[e].AccPtr = NULL;
AccCache[e].bytes = bytes;
AccCache[e].state = CpuDirty; // Cpu starts primary
Clone(e);
if(mode==Write)
AccCache[e].state = AccDirty; // Empty + AccWrite=> AccDirty
else
AccCache[e].state = Consistent; // Empty + AccRead => Consistent
AccCache[e].accLock= 1;
} else if(AccCache[e].state&CpuDirty ){
Clone(e);
if(mode==Write)
AccCache[e].state = AccDirty; // CpuDirty + AccWrite=> AccDirty
else
AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent
AccCache[e].accLock++;
} else if(AccCache[e].state&Consistent) {
if(mode==Write)
AccCache[e].state = AccDirty; // Consistent + AccWrite=> AccDirty
else
AccCache[e].state = Consistent; // Consistent + AccRead => Consistent
AccCache[e].accLock++;
} else if(AccCache[e].state&AccDirty) {
if(mode==Write)
AccCache[e].state = AccDirty; // AccDirty + AccWrite=> AccDirty
else
AccCache[e].state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache[e].accLock++;
} else {
assert(0);
}
AccCache[e].transient= transient? EvictNext : 0;
return AccCache[e].AccPtr;
}
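// Illustrative sketch (hypothetical caller, not from this commit): a Write
// view clones host data to the device on first touch and takes an accLock;
// Close only drops the lock, leaving the entry AccDirty and device resident:
//
//   void *d = AllocationCache::AccViewOpen(h,bytes,Write,0 /*resident*/);
//   // ... launch device kernel reading/writing through d ...
//   AllocationCache::AccViewClose(d); // accLock--, data stays on the device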
/*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void AllocationCache::AccViewClose(void* AccPtr)
{
int e=AccViewLookup(AccPtr);
assert(e!=-1);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock>0);
AccCache[e].accLock--;
}
void AllocationCache::CpuViewClose(void* CpuPtr)
{
int e=CpuViewLookup(CpuPtr);
assert(e!=-1);
assert(AccCache[e].cpuLock>0);
assert(AccCache[e].accLock==0);
AccCache[e].cpuLock--;
}
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
int e=CpuViewLookup(CpuPtr);
if(e==-1) {
e = ViewVictim();
Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
}
assert(AccCache[e].accLock==0); // Programming error
if(AccCache[e].state!=Empty) {
assert(AccCache[e].CpuPtr == CpuPtr);
assert(AccCache[e].bytes==bytes);
}
if(AccCache[e].state==Empty) {
AccCache[e].CpuPtr = CpuPtr;
AccCache[e].AccPtr = NULL;
AccCache[e].bytes = bytes;
AccCache[e].state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache[e].accLock= 0;
AccCache[e].cpuLock= 1;
} else if(AccCache[e].state==CpuDirty ){
// AccPtr: don't care, allocation is deferred
AccCache[e].state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache[e].cpuLock++;
} else if(AccCache[e].state==Consistent) {
assert(AccCache[e].AccPtr != NULL);
if(mode==Write)
AccCache[e].state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache[e].state = Consistent; // Consistent +CpuRead => Consistent
AccCache[e].cpuLock++;
} else if(AccCache[e].state==AccDirty) {
assert(AccCache[e].AccPtr != NULL);
Flush(e);
if(mode==Write) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache[e].state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache[e].cpuLock++;
} else {
assert(0); // should be unreachable
}
AccCache[e].transient= transient? EvictNext : 0;
return AccCache[e].CpuPtr;
}
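// Companion sketch (hypothetical caller, not from this commit): a Read view
// of an AccDirty entry triggers Flush(), so host code sees the device results
// without an explicit copy; the returned pointer is the CpuPtr itself:
//
//   void *h2 = AllocationCache::CpuViewOpen(h,bytes,Read,0);
//   // ... host reads through h2 (== h) ...
//   AllocationCache::CpuViewClose(h2);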
//////////////////////////////////////////////////////////////////////////////
//loop round robin over entries checking acc pointer
//////////////////////////////////////////////////////////////////////////////
int AllocationCache::CpuViewLookup(void *CpuPtr)
{
assert(CpuPtr!=NULL);
for(int e=0;e<NaccCache;e++){
if ( (AccCache[e].state!=Empty) && (AccCache[e].CpuPtr==CpuPtr) ) {
return e;
}
}
return -1;
}
int AllocationCache::AccViewLookup(void *AccPtr)
{
assert(AccPtr!=NULL);
for(int e=0;e<NaccCache;e++){
if ( (AccCache[e].state!=Empty) && (AccCache[e].AccPtr==AccPtr) ) {
return e;
}
}
return -1;
}
NAMESPACE_END(Grid);
#endif


@@ -0,0 +1,27 @@
#include <Grid/GridCore.h>
#ifdef GRID_UNIFIED
#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void AllocationCache::AccViewClose(void* AccPtr){}
void AllocationCache::CpuViewClose(void* CpuPtr){}
/////////////////////////////////////
// Dummy stubs
/////////////////////////////////////
int AllocationCache::ViewVictim(void) { assert(0); return 0;}
void AllocationCache::Evict(int e) { assert(0);}
void AllocationCache::Flush(int e) { assert(0);}
void AllocationCache::Clone(int e) { assert(0);}
int AllocationCache::CpuViewLookup(void *CpuPtr){assert(0); return 0;}
int AllocationCache::AccViewLookup(void *AccPtr){assert(0); return 0;}
NAMESPACE_END(Grid);
#endif


@@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512; // one 2 MiB huge page spans 512 4 KiB pages
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; // pagemap bits 0-54 hold the PFN
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
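// Illustrative sketch (hypothetical, not from this commit): with
// GRID_ALLOC_ALIGN at 2 MiB, a large allocation can be checked for
// huge-page backing directly:
//
//   void *buf = AllocationCache::CpuAllocate(1<<26);
//   check_huge_pages(buf,1<<26); // reports "... 0 not in huge pages" when fully huge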
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[bufSize];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 6) // keep s a valid index into suffixes[0..6]
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
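// Examples: sizeString(1536) -> "1.5 KB"; sizeString(3*1024*1024) -> "3 MB".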
NAMESPACE_END(Grid);


@@ -0,0 +1,95 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
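// Illustrative sketch (hypothetical driver, not from this commit): wire the
// profiler up once and every profilerAllocate/profilerFree call is tallied
// and, with debug on, logged through GridLogDebug:
//
//   static MemoryStats memStats;
//   MemoryProfiler::stats = &memStats;
//   MemoryProfiler::debug = true;
//   profilerAllocate(1024); // logs "allocating 1024 (1 KB)" plus the running totals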
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);