Mirror of https://github.com/paboyle/Grid.git

commit ebb60330c9
parent a9847aa866

    Automatic data motion options beginning
@@ -47,7 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
-#include <Grid/allocator/AlignedAllocator.h>
+#include <Grid/allocator/Allocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/threads/ThreadReduction.h>
 #include <Grid/serialisation/Serialisation.h>
@@ -26,102 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
-#ifndef GRID_ALIGNED_ALLOCATOR_H
-#define GRID_ALIGNED_ALLOCATOR_H
+#pragma once

 NAMESPACE_BEGIN(Grid);

-/*Move control to configure.ac and Config.h*/
-#define POINTER_CACHE
-/*Pinning pages is costly*/
-/*Could maintain separate large and small allocation caches*/
-#ifdef POINTER_CACHE
-class PointerCache {
-private:
-
-  static const int Ncache=128;
-  static int victim;
-
-  typedef struct {
-    void *address;
-    size_t bytes;
-    int valid;
-  } PointerCacheEntry;
-
-  static PointerCacheEntry Entries[Ncache];
-
-public:
-
-  static void *Insert(void *ptr,size_t bytes) ;
-  static void *Lookup(size_t bytes) ;
-
-};
-#endif
-
-std::string sizeString(size_t bytes);
-
-struct MemoryStats
-{
-  size_t totalAllocated{0}, maxAllocated{0},
-         currentlyAllocated{0}, totalFreed{0};
-};
-
-class MemoryProfiler
-{
-public:
-  static MemoryStats *stats;
-  static bool        debug;
-};
-
-#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
-#define profilerDebugPrint \
-  if (MemoryProfiler::stats) \
-  { \
-    auto s = MemoryProfiler::stats; \
-    std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
-    std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
-              << std::endl; \
-    std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
-              << std::endl; \
-    std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
-              << std::endl; \
-    std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
-              << std::endl; \
-  }
-
-#define profilerAllocate(bytes) \
-  if (MemoryProfiler::stats) \
-  { \
-    auto s = MemoryProfiler::stats; \
-    s->totalAllocated     += (bytes); \
-    s->currentlyAllocated += (bytes); \
-    s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated); \
-  } \
-  if (MemoryProfiler::debug) \
-  { \
-    std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
-    profilerDebugPrint; \
-  }
-
-#define profilerFree(bytes) \
-  if (MemoryProfiler::stats) \
-  { \
-    auto s = MemoryProfiler::stats; \
-    s->totalFreed         += (bytes); \
-    s->currentlyAllocated -= (bytes); \
-  } \
-  if (MemoryProfiler::debug) \
-  { \
-    std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
-    profilerDebugPrint; \
-  }
-
-void check_huge_pages(void *Buf,uint64_t BYTES);

 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////

 template<typename _Tp>
 class alignedAllocator {
 public:
@@ -144,42 +52,23 @@ public:
   pointer allocate(size_type __n, const void* _p= 0)
   {
     size_type bytes = __n*sizeof(_Tp);

     profilerAllocate(bytes);

-#ifdef POINTER_CACHE
-    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
-#else
-    pointer ptr = nullptr;
-#endif
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) acceleratorAllocShared(bytes);
+    _Tp *ptr = (_Tp*) AllocationCache::CpuAllocate(bytes);

     assert( ( (_Tp*)ptr != (_Tp *)NULL ) );

 #if 0
     size_type page_size=4096;
     size_type pages = (bytes+page_size-1)/page_size;
     uint8_t *bp = (uint8_t *)ptr;

     accelerator_for(pg,pages,1,{
       bp[pg*page_size]=0;
     });
 #endif
     return ptr;
   }

-  void deallocate(pointer __p, size_type __n) {
+  void deallocate(pointer __p, size_type __n)
+  {
     size_type bytes = __n * sizeof(_Tp);

     profilerFree(bytes);

-#ifdef POINTER_CACHE
-    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
-#else
-    pointer __freeme = __p;
-#endif
-    if ( __freeme ) acceleratorFreeShared((void *)__freeme);
+    AllocationCache::CpuFree((void *)__p,bytes);
   }

   // FIXME: hack for the copy constructor, eventually it must be avoided
@@ -201,4 +90,4 @@ template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<
 NAMESPACE_END(Grid);

 #endif
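Editor's sketch, not part of the commit: with the change above, any standard container parameterised on alignedAllocator recycles its buffers through the new cache; buf and the sizes are illustrative.

#include <vector>
using Grid::alignedAllocator;

std::vector<double,alignedAllocator<double>> buf(1024); // allocate() routes to AllocationCache::CpuAllocate (cache lookup first)
buf = decltype(buf)();                                  // deallocate() routes to AllocationCache::CpuFree; pointer parked for reuse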
Grid/allocator/AllocationCache.cc (new file, 159 lines)
@@ -0,0 +1,159 @@
#include <Grid/GridCore.h>

NAMESPACE_BEGIN(Grid);

/*Allocation types, saying which pointer cache should be used*/
#define Cpu      (0)
#define CpuSmall (1)
#define Acc      (2)
#define AccSmall (3)

//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pointer caches
//////////////////////////////////////////////////////////////////////
AllocationCache::AllocationCacheEntry AllocationCache::Entries[AllocationCache::NallocType][AllocationCache::NallocCacheMax];
int AllocationCache::Victim[AllocationCache::NallocType];
int AllocationCache::Ncache[AllocationCache::NallocType];

//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *AllocationCache::AcceleratorAllocate(size_t bytes)
{
  void *ptr = (void *) Lookup(bytes,Acc);

  if ( ptr == (void *) NULL )
    ptr = (void *) acceleratorAllocDevice(bytes);

  return ptr;
}
void AllocationCache::AcceleratorFree (void *ptr,size_t bytes)
{
  void *__freeme = Insert(ptr,bytes,Acc);

  if ( __freeme ) acceleratorFreeShared(__freeme);
}
void *AllocationCache::CpuAllocate(size_t bytes)
{
  void *ptr = (void *) Lookup(bytes,Cpu);

  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
    // std::cout <<"CpuAllocate: allocated pointer "<<std::hex<<ptr<<std::endl;
  } else {
    // std::cout <<"CpuAllocate: cached pointer "<<std::hex<<ptr<<std::endl;
  }

  return ptr;
}
void AllocationCache::CpuFree (void *ptr,size_t bytes)
{
  // Look up in ViewCache
  int e=CpuViewLookup(ptr);
  if(e>=0){ Evict(e); }

  // If present remove entry and free accelerator too.
  // Can we ever hit a free event with a view still in scope?
  void *__freeme = Insert(ptr,bytes,Cpu);
  // std::cout <<"CpuFree cached pointer "<<std::hex<<ptr<<std::endl;
  // std::cout <<"CpuFree deallocating pointer "<<std::hex<<__freeme<<std::endl;
  if ( __freeme ) acceleratorFreeShared(__freeme);
}
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void AllocationCache::Init(void)
{
  Ncache[Cpu]      = 8;
  Ncache[Acc]      = 8;
  Ncache[CpuSmall] = 32;
  Ncache[AccSmall] = 32;

  char * str;
  int Nc;
  int NcS;

  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
    Nc = atoi(str);
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[Cpu]=Nc;
      Ncache[Acc]=Nc;
    }
  }

  str= getenv("GRID_ALLOC_NCACHE_SMALL");
  if ( str ) {
    Nc = atoi(str);
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[CpuSmall]=Nc;
      Ncache[AccSmall]=Nc;
    }
  }
}
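Editor's note: the cache depths above are runtime-tunable. A hedged sketch of pinning them from a test harness before initialisation (assumes POSIX setenv; the values are illustrative):

#include <cstdlib>
#include <Grid/Grid.h>
int main(int argc,char **argv) {
  setenv("GRID_ALLOC_NCACHE_LARGE","16",1); // entries kept for allocations >= GRID_ALLOC_SMALL_LIMIT
  setenv("GRID_ALLOC_NCACHE_SMALL","64",1); // entries kept for smaller allocations
  Grid::Grid_init(&argc,&argv);             // Grid_init calls AllocationCache::Init(), which reads both variables once (see Init.cc below)
}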
void *AllocationCache::Insert(void *ptr,size_t bytes,int type)
{
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
}
void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
  assert(ncache>0);
#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
#endif

  void * ret = NULL;
  int v = -1;

  for(int e=0;e<ncache;e++) {
    if ( entries[e].valid==0 ) {
      v=e;
      break;
    }
  }

  if ( v==-1 ) {
    v=victim;
    victim = (victim+1)%ncache;
  }

  if ( entries[v].valid ) {
    ret = entries[v].address;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
  }

  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;

  return ret;
}

void *AllocationCache::Lookup(size_t bytes,int type)
{
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
  return Lookup(bytes,Entries[cache],Ncache[cache]);
}
void *AllocationCache::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{
  assert(ncache>0);
#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
#endif
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      return entries[e].address;
    }
  }
  return NULL;
}

NAMESPACE_END(Grid);
Grid/allocator/AllocationCache.h (new file, 93 lines)
@@ -0,0 +1,93 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/AllocationCache.h

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once

NAMESPACE_BEGIN(Grid);

// Move control to configure.ac and Config.h?

#define ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)

/*Pinning pages is costly*/

class AllocationCache {
private:

  ////////////////////////////////////////////////////////////
  // For caching recently freed allocations
  ////////////////////////////////////////////////////////////
  typedef struct {
    void *address;
    size_t bytes;
    int valid;
  } AllocationCacheEntry;

  static const int NallocCacheMax=128;
  static const int NallocType=4;
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];

  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes,int type) ;
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;

  /////////////////////////////////////////////////
  // Internal device view
  /////////////////////////////////////////////////
  static void *AcceleratorAllocate(size_t bytes);
  static void  AcceleratorFree    (void *ptr,size_t bytes);
  static int   ViewVictim(void);
  static void  Evict(int e);
  static void  Flush(int e);
  static void  Clone(int e);
  static int   CpuViewLookup(void *CpuPtr);
  static int   AccViewLookup(void *AccPtr);

public:
  static void Init(void);

  static void  AccViewClose(void* AccPtr);
  static void  CpuViewClose(void* CpuPtr);
  static void *AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
  static void *CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);

  static void *CpuAllocate(size_t bytes);
  static void  CpuFree    (void *ptr,size_t bytes);
};

NAMESPACE_END(Grid);
Grid/allocator/Allocator.h (new file, 4 lines)
@@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/AllocationCache.h>
#include <Grid/allocator/AlignedAllocator.h>
Grid/allocator/MemoryCacheDeviceMem.cc (new file, 338 lines)
@@ -0,0 +1,338 @@
#include <Grid/GridCore.h>
#ifndef GRID_UNIFIED

#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define dprintf(...)

////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
const int NaccCacheMax=128;

typedef struct {
  void *CpuPtr;
  void *AccPtr;
  size_t bytes;
  uint32_t transient;
  uint32_t state;
  uint32_t accLock;
  uint32_t cpuLock;
} AcceleratorViewEntry;

#define Write        (1)
#define Read         (2)
#define WriteDiscard (3)
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
static AcceleratorViewEntry AccCache[NaccCacheMax];
static int AccCacheVictim; // Base for round robin search
static int NaccCache = 8;

////////////////////////////////////
// Priority ordering for unlocked entries
//  Empty
//  CpuDirty
//  Consistent
//  AccDirty
////////////////////////////////////
#define Empty      (0x0) /*Entry unoccupied                                   */
#define CpuDirty   (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid                    */
#define AccDirty   (0x4) /*ACC copy is golden                                 */
#define EvictNext  (0x8) /*Priority for eviction*/

int AllocationCache::ViewVictim(void)
{
  int prioEmpty        =-1;
  int prioCpuDirty     =-1;
  int prioConsistent   =-1;
  int prioAccDirty     =-1;
  int prioCpuDirtyEN   =-1;
  int prioConsistentEN =-1;
  int prioAccDirtyEN   =-1;

  int victim=-1;

  // round robin priority search of unlocked entries offset from current victim
  for(int ep=0;ep<NaccCache;ep++){
    int e = (ep+AccCacheVictim)%NaccCache;
    dprintf("AllocationCacheDeviceMem: Inspecting cache entry %d :",e);

    uint32_t locks = AccCache[e].cpuLock+AccCache[e].accLock;
    uint32_t s = AccCache[e].state;
    uint32_t t = AccCache[e].transient;

    assert( (s==Empty)||(s==CpuDirty)||(s==AccDirty)||(s==Consistent));

    if ( locks==0 ) {

      if( s==Empty ) { prioEmpty = e; dprintf("Empty");}

      if( t == EvictNext ) {
        if( s==CpuDirty   ) { prioCpuDirtyEN   = e; dprintf("CpuDirty Transient");}
        if( s==Consistent ) { prioConsistentEN = e; dprintf("Consistent Transient");}
        if( s==AccDirty   ) { prioAccDirtyEN   = e; dprintf("AccDirty Transient");}
      } else {
        if( s==CpuDirty   ) { prioCpuDirty   = e; dprintf("CpuDirty");}
        if( s==Consistent ) { prioConsistent = e; dprintf("Consistent");}
        if( s==AccDirty   ) { prioAccDirty   = e; dprintf("AccDirty");}
      }

    } else {
      if ( AccCache[e].cpuLock ) dprintf("Locked in Cpu ");
      if ( AccCache[e].accLock ) dprintf("Locked in Acc ");
    }
    dprintf("\n");
  }
  // This encodes the prioritisation for device residency
  // EvictNext provides a transient mechanism
  if ( prioAccDirty     >= 0 ) victim = prioAccDirty;
  if ( prioConsistent   >= 0 ) victim = prioConsistent;
  if ( prioCpuDirty     >= 0 ) victim = prioCpuDirty;
  if ( prioAccDirtyEN   >= 0 ) victim = prioAccDirtyEN;
  if ( prioConsistentEN >= 0 ) victim = prioConsistentEN;
  if ( prioCpuDirtyEN   >= 0 ) victim = prioCpuDirtyEN;
  if ( prioEmpty        >= 0 ) victim = prioEmpty; /*Highest prio is winner*/

  assert(victim >= 0); // Must succeed
  dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim);

  // advance victim pointer
  AccCacheVictim=(AccCacheVictim+1)%NaccCache;
  dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache);

  return victim;
}
/////////////////////////////////////////////////
// Accelerator cache motion
/////////////////////////////////////////////////
void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry
{
  if(AccCache[e].state!=Empty){
    dprintf("AllocationCache: Evict(%d) %llx,%llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
    assert(AccCache[e].accLock==0);
    assert(AccCache[e].cpuLock==0);
    if(AccCache[e].state==AccDirty) {
      Flush(e);
    }
    assert(AccCache[e].CpuPtr!=NULL);
    if(AccCache[e].AccPtr) {
      dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
      AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes);
    }
  }
  AccCache[e].AccPtr=NULL;
  AccCache[e].CpuPtr=NULL;
  AccCache[e].bytes=0;
  AccCache[e].state=Empty;
  AccCache[e].accLock=0;
  AccCache[e].cpuLock=0;
}
void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. Do not remove
{
  dprintf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
  assert(AccCache[e].state==AccDirty);
  assert(AccCache[e].cpuLock==0);
  assert(AccCache[e].accLock==0);
  assert(AccCache[e].AccPtr!=NULL);
  assert(AccCache[e].CpuPtr!=NULL);
  acceleratorCopyFromDevice(AccCache[e].AccPtr,AccCache[e].CpuPtr,AccCache[e].bytes);
  AccCache[e].state=Consistent;
}
void AllocationCache::Clone(int e)// Copy from CPU, mark consistent. Allocate if necessary
{
  assert(AccCache[e].state==CpuDirty);
  assert(AccCache[e].cpuLock==0);
  assert(AccCache[e].accLock==0);
  assert(AccCache[e].CpuPtr!=NULL);
  if(AccCache[e].AccPtr==NULL){
    AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes);
  }
  dprintf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
  acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes);
  AccCache[e].state=Consistent;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
{
  ////////////////////////////////////////////////////////////////////////////
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  int e=CpuViewLookup(CpuPtr);
  if(e==-1) {
    e = ViewVictim();
    Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
  }

  assert(AccCache[e].cpuLock==0);  // Programming error

  if(AccCache[e].state!=Empty) {
    assert(AccCache[e].CpuPtr == CpuPtr);
    assert(AccCache[e].bytes==bytes);
  }
  /*
   * State transitions and actions
   *
   * Action   State      StateNext   Flush   Clone
   *
   * AccRead  Empty      Consistent    -       Y
   * AccWrite Empty      AccDirty      -       Y
   * AccRead  CpuDirty   Consistent    -       Y
   * AccWrite CpuDirty   AccDirty      -       Y
   * AccRead  Consistent Consistent    -       -
   * AccWrite Consistent AccDirty      -       -
   * AccRead  AccDirty   AccDirty      -       -
   * AccWrite AccDirty   AccDirty      -       -
   */
  if(AccCache[e].state==Empty) {
    AccCache[e].CpuPtr = CpuPtr;
    AccCache[e].AccPtr = NULL;
    AccCache[e].bytes  = bytes;
    AccCache[e].state  = CpuDirty;    // Cpu starts primary
    Clone(e);
    if(mode==Write)
      AccCache[e].state = AccDirty;   // Empty + AccWrite=> AccDirty
    else
      AccCache[e].state = Consistent; // Empty + AccRead => Consistent
    AccCache[e].accLock= 1;
  } else if(AccCache[e].state&CpuDirty ){
    Clone(e);
    if(mode==Write)
      AccCache[e].state = AccDirty;   // CpuDirty + AccWrite=> AccDirty
    else
      AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent
    AccCache[e].accLock++;
  } else if(AccCache[e].state&Consistent) {
    if(mode==Write)
      AccCache[e].state = AccDirty;   // Consistent + AccWrite=> AccDirty
    else
      AccCache[e].state = Consistent; // Consistent + AccRead => Consistent
    AccCache[e].accLock++;
  } else if(AccCache[e].state&AccDirty) {
    if(mode==Write)
      AccCache[e].state = AccDirty;   // AccDirty + AccWrite=> AccDirty
    else
      AccCache[e].state = AccDirty;   // AccDirty + AccRead => AccDirty
    AccCache[e].accLock++;
  } else {
    assert(0);
  }

  AccCache[e].transient= transient? EvictNext : 0;

  return AccCache[e].AccPtr;
}
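Editor's sketch of the calling pattern for the state machine above, not part of the commit; host-side names and sizes are illustrative.

size_t bytes = 1024*sizeof(double);
double *host_ptr = (double *)AllocationCache::CpuAllocate(bytes);
void *acc = AllocationCache::AccViewOpen(host_ptr,bytes,Write,0); // Empty -> Clone -> AccDirty, accLock=1
// ... accelerator kernel writes through acc ...
AllocationCache::AccViewClose(acc);                               // accLock--; the dirty copy is flushed lazily on CpuViewOpen or Evict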
/*
 * Action    State      StateNext  Flush  Clone
 *
 * CpuRead   Empty      CpuDirty     -      -
 * CpuWrite  Empty      CpuDirty     -      -
 * CpuRead   CpuDirty   CpuDirty     -      -
 * CpuWrite  CpuDirty   CpuDirty     -      -
 * CpuRead   Consistent Consistent   -      -
 * CpuWrite  Consistent CpuDirty     -      -
 * CpuRead   AccDirty   Consistent   Y      -
 * CpuWrite  AccDirty   CpuDirty     Y      -
 */
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void AllocationCache::AccViewClose(void* AccPtr)
{
  int e=AccViewLookup(AccPtr);
  assert(e!=-1);
  assert(AccCache[e].cpuLock==0);
  assert(AccCache[e].accLock>0);
  AccCache[e].accLock--;
}
void AllocationCache::CpuViewClose(void* CpuPtr)
{
  int e=CpuViewLookup(CpuPtr);
  assert(e!=-1);
  assert(AccCache[e].cpuLock>0);
  assert(AccCache[e].accLock==0);
  AccCache[e].cpuLock--;
}
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient)
{
  ////////////////////////////////////////////////////////////////////////////
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  int e=CpuViewLookup(CpuPtr);
  if(e==-1) {
    e = ViewVictim();
    Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
  }

  assert(AccCache[e].accLock==0);  // Programming error

  if(AccCache[e].state!=Empty) {
    assert(AccCache[e].CpuPtr == CpuPtr);
    assert(AccCache[e].bytes==bytes);
  }

  if(AccCache[e].state==Empty) {
    AccCache[e].CpuPtr = CpuPtr;
    AccCache[e].AccPtr = NULL;
    AccCache[e].bytes  = bytes;
    AccCache[e].state  = CpuDirty;    // Empty + CpuRead/CpuWrite => CpuDirty
    AccCache[e].accLock= 0;
    AccCache[e].cpuLock= 1;
  } else if(AccCache[e].state==CpuDirty ){
    // AccPtr don't care, deferred allocate
    AccCache[e].state = CpuDirty;     // CpuDirty +CpuRead/CpuWrite => CpuDirty
    AccCache[e].cpuLock++;
  } else if(AccCache[e].state==Consistent) {
    assert(AccCache[e].AccPtr != NULL);
    if(mode==Write)
      AccCache[e].state = CpuDirty;   // Consistent +CpuWrite => CpuDirty
    else
      AccCache[e].state = Consistent; // Consistent +CpuRead  => Consistent
    AccCache[e].cpuLock++;
  } else if(AccCache[e].state==AccDirty) {
    assert(AccCache[e].AccPtr != NULL);
    Flush(e);
    if(mode==Write) AccCache[e].state = CpuDirty;   // AccDirty +CpuWrite => CpuDirty, Flush
    else            AccCache[e].state = Consistent; // AccDirty +CpuRead  => Consistent, Flush
    AccCache[e].cpuLock++;
  } else {
    assert(0); // should be unreachable
  }

  AccCache[e].transient= transient? EvictNext : 0;

  return AccCache[e].CpuPtr;
}

//////////////////////////////////////////////////////////////////////////////
// Linear scan over entries matching the CPU / accelerator pointer
//////////////////////////////////////////////////////////////////////////////
int AllocationCache::CpuViewLookup(void *CpuPtr)
{
  assert(CpuPtr!=NULL);
  for(int e=0;e<NaccCache;e++){
    if ( (AccCache[e].state!=Empty) && (AccCache[e].CpuPtr==CpuPtr) ) {
      return e;
    }
  }
  return -1;
}
int AllocationCache::AccViewLookup(void *AccPtr)
{
  assert(AccPtr!=NULL);
  for(int e=0;e<NaccCache;e++){
    if ( (AccCache[e].state!=Empty) && (AccCache[e].AccPtr==AccPtr) ) {
      return e;
    }
  }
  return -1;
}

NAMESPACE_END(Grid);

#endif
Grid/allocator/MemoryCacheShared.cc (new file, 27 lines)
@@ -0,0 +1,27 @@
#include <Grid/GridCore.h>
#ifdef GRID_UNIFIED

#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////

void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; }
void  AllocationCache::AccViewClose(void* AccPtr){}
void  AllocationCache::CpuViewClose(void* CpuPtr){}

/////////////////////////////////////
// Dummy stubs
/////////////////////////////////////
int  AllocationCache::ViewVictim(void) { assert(0); return 0;}
void AllocationCache::Evict(int e) { assert(0);}
void AllocationCache::Flush(int e) { assert(0);}
void AllocationCache::Clone(int e) { assert(0);}

int AllocationCache::CpuViewLookup(void *CpuPtr){assert(0); return 0;}
int AllocationCache::AccViewLookup(void *AccPtr){assert(0); return 0;}

NAMESPACE_END(Grid);
#endif
Grid/allocator/MemoryStats.cc (new file, 67 lines)
@@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>

NAMESPACE_BEGIN(Grid);

MemoryStats *MemoryProfiler::stats = nullptr;
bool         MemoryProfiler::debug = false;

void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  uint64_t pagedata[npages];
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      if (pageaddr != baseaddr + j * page_size)
        ++nnothuge;
    }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}

std::string sizeString(const size_t bytes)
{
  constexpr unsigned int bufSize = 256;
  const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
  char buf[256];
  size_t s = 0;
  double count = bytes;

  while (count >= 1024 && s < 7)
  {
    s++;
    count /= 1024;
  }
  if (count - floor(count) == 0.0)
  {
    snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
    snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
  }

  return std::string(buf);
}
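Editor's note: expected behaviour of sizeString as written above, for reference.

// sizeString(512)    -> "512 B"
// sizeString(1024)   -> "1 KB"
// sizeString(1536)   -> "1.5 KB"
// sizeString(3u<<20) -> "3 MB"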

NAMESPACE_END(Grid);
Grid/allocator/MemoryStats.h (new file, 95 lines)
@@ -0,0 +1,95 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/MemoryStats.h

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once

NAMESPACE_BEGIN(Grid);

std::string sizeString(size_t bytes);

struct MemoryStats
{
  size_t totalAllocated{0}, maxAllocated{0},
         currentlyAllocated{0}, totalFreed{0};
};

class MemoryProfiler
{
public:
  static MemoryStats *stats;
  static bool        debug;
};

#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
  if (MemoryProfiler::stats) \
  { \
    auto s = MemoryProfiler::stats; \
    std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
    std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
              << std::endl; \
    std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
              << std::endl; \
    std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
              << std::endl; \
    std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
              << std::endl; \
  }

#define profilerAllocate(bytes) \
  if (MemoryProfiler::stats) \
  { \
    auto s = MemoryProfiler::stats; \
    s->totalAllocated     += (bytes); \
    s->currentlyAllocated += (bytes); \
    s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated); \
  } \
  if (MemoryProfiler::debug) \
  { \
    std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
    profilerDebugPrint; \
  }

#define profilerFree(bytes) \
  if (MemoryProfiler::stats) \
  { \
    auto s = MemoryProfiler::stats; \
    s->totalFreed         += (bytes); \
    s->currentlyAllocated -= (bytes); \
  } \
  if (MemoryProfiler::debug) \
  { \
    std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
    profilerDebugPrint; \
  }
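Editor's sketch: the hooks above are off by default and are enabled from user code.

MemoryProfiler::stats = new MemoryStats; // start accumulating the four counters
MemoryProfiler::debug = true;            // log every profilerAllocate / profilerFree
// ... run the workload ...
std::cout << memString(MemoryProfiler::stats->maxAllocated) << std::endl; // peak footprint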

void check_huge_pages(void *Buf,uint64_t BYTES);

NAMESPACE_END(Grid);
@@ -87,7 +87,7 @@ sobj eval(const uint64_t ss, const sobj &arg)
 }

 template <class lobj> accelerator_inline
-const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
+const lobj & eval(const uint64_t ss, const LatticeExprView<lobj> &arg)
 {
   return arg[ss];
 }
@@ -179,16 +179,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
   cb = lat.Checkerboard();
 }
 template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
-inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
-{
-}
-
+inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
 template <typename Op, typename T1> inline
 void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
 {
   CBFromExpression(cb, expr.arg1); // recurse AST
 }

 template <typename Op, typename T1, typename T2> inline
 void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
 {
@@ -203,6 +199,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
   CBFromExpression(cb, expr.arg3); // recurse AST
 }

+//////////////////////////////////////////////////////////////////////////
+// ViewOpen
+//////////////////////////////////////////////////////////////////////////
+template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
+{
+  lat.AcceleratorViewOpen();
+}
+template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void ExpressionViewOpen(T1 &notlat) {}
+
+template <typename Op, typename T1> inline
+void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
+{
+  ExpressionViewOpen(expr.arg1); // recurse AST
+}
+
+template <typename Op, typename T1, typename T2> inline
+void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
+{
+  ExpressionViewOpen(expr.arg1); // recurse AST
+  ExpressionViewOpen(expr.arg2); // recurse AST
+}
+template <typename Op, typename T1, typename T2, typename T3>
+inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+{
+  ExpressionViewOpen(expr.arg1); // recurse AST
+  ExpressionViewOpen(expr.arg2); // recurse AST
+  ExpressionViewOpen(expr.arg3); // recurse AST
+}
+
+//////////////////////////////////////////////////////////////////////////
+// ViewClose
+//////////////////////////////////////////////////////////////////////////
+template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void ExpressionViewClose( T1 &lat) // Lattice leaf
+{
+  lat.AcceleratorViewClose();
+}
+template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void ExpressionViewClose(T1 &notlat) {}
+
+template <typename Op, typename T1> inline
+void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
+{
+  ExpressionViewClose(expr.arg1); // recurse AST
+}
+template <typename Op, typename T1, typename T2> inline
+void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
+{
+  ExpressionViewClose(expr.arg1); // recurse AST
+  ExpressionViewClose(expr.arg2); // recurse AST
+}
+template <typename Op, typename T1, typename T2, typename T3>
+inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+{
+  ExpressionViewClose(expr.arg1); // recurse AST
+  ExpressionViewClose(expr.arg2); // recurse AST
+  ExpressionViewClose(expr.arg3); // recurse AST
+}
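Editor's sketch of how the assignment operators further down drive this machinery; LatticeComplex and grid are illustrative names, not from this diff.

LatticeComplex a(&grid), b(&grid), c(&grid);
c = a + b; // operator= copies the AST (exprCopy), calls ExpressionViewOpen(exprCopy),
           // evaluates site-by-site in accelerator_for, then ExpressionViewClose(exprCopy)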

////////////////////////////////////////////
// Unary operators and funcs
////////////////////////////////////////////
@@ -83,11 +83,9 @@ public:
 // The copy constructor for this will need to be used by device lambda functions
 /////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj>
-class LatticeView : public LatticeAccelerator<vobj>
+class LatticeExprView : public LatticeAccelerator<vobj>
 {
 public:

 // Rvalue
 #ifdef GRID_SIMT
   accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
@@ -102,11 +100,65 @@ public:
   accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
   accelerator_inline uint64_t size(void)  const { return this->_odata_size; };

-  LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
+  // Non accelerator functions
+  LatticeExprView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
+  ~LatticeExprView(){}
+
+  void AcceleratorViewOpen(void)
+  { // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
+    void *cpu_ptr=this->_odata;
+    // std::cout << "AccViewOpen "<<std::hex<<this->_odata <<std::dec<<std::endl;
+    this->_odata=(vobj *)AllocationCache::AccViewOpen(this->_odata,this->_odata_size*sizeof(vobj),1,0);
+  }
+  void AcceleratorViewClose(void)
+  { // Inform the manager
+    // std::cout << "View Close"<<std::hex<<this->_odata<<std::dec <<std::endl;
+    AllocationCache::AccViewClose((void *)this->_odata);
+  }
+  void CpuViewOpen(void)
+  { // Translate the pointer
+    void *cpu_ptr=this->_odata;
+    // std::cout << "CpuViewOpen "<<std::hex<<this->_odata <<std::dec<<std::endl;
+    this->_odata=(vobj *)AllocationCache::CpuViewOpen(cpu_ptr,this->_odata_size*sizeof(vobj),1,0);
+  }
+  void CpuViewClose(void)
+  { // Inform the manager
+    // std::cout << "CpuViewClose"<<std::hex<<this->_odata<<std::dec <<std::endl;
+    AllocationCache::CpuViewClose((void *)this->_odata);
+  }
+
+};
+// UserView constructor,destructor updates view manager
+// Non-copyable object??? Second base with copy/= deleted?
+template<class vobj>
+class LatticeView : public LatticeExprView<vobj>
+{
+public:
+  // Rvalue
+  /*
+#ifdef GRID_SIMT
+  accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
+#else
+  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
+#endif
+
+  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
+  accelerator_inline vobj & operator[](size_t i)             { return this->_odata[i]; };
+
+  accelerator_inline uint64_t begin(void) const { return 0;};
+  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
+  accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
+  */
+  LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeExprView<vobj> (refer_to_me)
+  {
+    this->AcceleratorViewOpen();
+  }
+  ~LatticeView(){
+    this->AcceleratorViewClose();
+  }
+};

/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
@@ -120,7 +172,7 @@ template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;

 template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
-template<class T> struct ViewMapBase<T,true>  { typedef LatticeView<typename T::vector_object> Type; };
+template<class T> struct ViewMapBase<T,true>  { typedef LatticeExprView<typename T::vector_object> Type; };
 template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;

 template <typename Op, typename _T1>
@@ -231,12 +283,15 @@ public:
     CBFromExpression(cb,expr);
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;

+    auto exprCopy = expr;
+    ExpressionViewOpen(exprCopy);
     auto me  = View();
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,expr);
+      auto tmp = eval(ss,exprCopy);
       vstream(me[ss],tmp);
     });
+    ExpressionViewClose(exprCopy);
     return *this;
   }
   template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@@ -251,11 +306,14 @@ public:
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;

+    auto exprCopy = expr;
+    ExpressionViewOpen(exprCopy);
     auto me  = View();
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,expr);
+      auto tmp = eval(ss,exprCopy);
       vstream(me[ss],tmp);
     });
+    ExpressionViewClose(exprCopy);
     return *this;
   }
   template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@@ -269,11 +327,14 @@ public:
     CBFromExpression(cb,expr);
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
+    auto exprCopy = expr;
+    ExpressionViewOpen(exprCopy);
     auto me  = View();
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,expr);
+      auto tmp = eval(ss,exprCopy);
       vstream(me[ss],tmp);
     });
+    ExpressionViewClose(exprCopy);
     return *this;
   }
   //GridFromExpression is tricky to do
@@ -27,6 +27,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 *************************************************************************************/
 /* END LEGAL */
 #pragma once

+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#ifdef HAVE_MM_MALLOC_H
+#include <mm_malloc.h>
+#endif
+
 NAMESPACE_BEGIN(Grid);

 //////////////////////////////////////////////////////////////////////////////////
@@ -144,8 +155,8 @@ inline void *acceleratorAllocDevice(size_t bytes)
 };
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
-
-
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 #endif

 //////////////////////////////////////////////
@@ -192,6 +203,8 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}

 #endif
@@ -275,6 +288,8 @@ inline void *acceleratorAllocDevice(size_t bytes)

 inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}

 #endif
@@ -311,16 +326,8 @@ inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });

 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

-#ifdef HAVE_MALLOC_MALLOC_H
-#include <malloc/malloc.h>
-#endif
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-#ifdef HAVE_MM_MALLOC_H
-#include <mm_malloc.h>
-#endif
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}

 #ifdef HAVE_MM_MALLOC_H
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
@@ -286,6 +286,8 @@ void Grid_init(int *argc,char ***argv)
   //////////////////////////////////////////////////////////
   acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver

+  AllocationCache::Init();
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
     int MB;
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
configure.ac (17 lines changed)
@@ -147,7 +147,7 @@ case ${ac_SUMMIT} in
     AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
 esac

-############### SYCL
+############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
     [AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])],
     [ac_ACCELERATOR=${enable_accelerator}], [ac_ACCELERATOR=none])
@@ -168,6 +168,20 @@ case ${ac_ACCELERATOR} in
     AC_MSG_ERROR(["Acceleration not supported ${ac_ACCELERATOR}"]);;
 esac

+############### UNIFIED MEMORY
+AC_ARG_ENABLE([unified],
+    [AC_HELP_STRING([--enable-unified=yes|no], [enable unified address space for accelerator loops])],
+    [ac_UNIFIED=${enable_unified}], [ac_UNIFIED=yes])
+case ${ac_UNIFIED} in
+   yes)
+     echo Unified memory for accelerator loops
+     AC_DEFINE([GRID_UVM],[1],[Use unified address space]);;
+   no)
+     echo Manual memory copy for accelerator loops;;
+   *)
+     AC_MSG_ERROR(["Unified virtual memory option not supported ${ac_UNIFIED}"]);;
+esac
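Editor's note: with these options a non-unified CUDA build would be requested along the lines of "./configure --enable-accelerator=cuda --enable-unified=no", selecting manual host/device copies instead of a unified address space.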

 ############### Intel libraries
 AC_ARG_ENABLE([mkl],
     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@@ -612,6 +626,7 @@ compiler version                : ${ax_cv_gxx_version}
 SIMD                            : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading                       : ${ac_openmp}
 Acceleration                    : ${ac_ACCELERATOR}
+Unified virtual memory          : ${ac_UNIFIED}
 Communications type             : ${comms_type}
 Shared memory allocator         : ${ac_SHM}
 Shared memory mmap path         : ${ac_SHMPATH}