From a7635fd5ba250e95483005c6988b30b27980d928 Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Mon, 18 May 2020 17:52:26 -0400
Subject: [PATCH] summit mem

---
 Grid/GridCore.h                     |   1 +
 Grid/allocator/AlignedAllocator.h   |   9 +-
 Grid/allocator/GridMemoryManager.cc | 131 ++++++++++++++++++++++++++++
 Grid/allocator/GridMemoryManager.h  |  42 +++++++++
 Grid/lattice/Lattice_base.h         |  39 ++++-----
 Grid/lattice/Lattice_transfer.h     |   2 +-
 6 files changed, 197 insertions(+), 27 deletions(-)
 create mode 100644 Grid/allocator/GridMemoryManager.cc
 create mode 100644 Grid/allocator/GridMemoryManager.h

diff --git a/Grid/GridCore.h b/Grid/GridCore.h
index a48d2d49..495a81e1 100644
--- a/Grid/GridCore.h
+++ b/Grid/GridCore.h
@@ -47,6 +47,7 @@ Author: paboyle
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 77167299..600b7097 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -178,12 +178,13 @@ public:
     ////////////////////////////////////
     if ( ptr == (_Tp *) NULL ) {
       // printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
-      auto err = cudaMallocManaged((void **)&ptr,bytes);
-      if( err != cudaSuccess ) {
+      // auto err =
+      gridMallocManaged((void **)&ptr,bytes);
+      /*if( err != cudaSuccess ) {
        ptr = (_Tp *) NULL;
        std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<std::endl;
        assert(0);
-      }
+      }*/
diff --git a/Grid/allocator/GridMemoryManager.cc b/Grid/allocator/GridMemoryManager.cc
new file mode 100644
--- /dev/null
+++ b/Grid/allocator/GridMemoryManager.cc
@@ -0,0 +1,131 @@
+
+NAMESPACE_BEGIN(Grid);
+
+#define _GRID_MEM_PAGE_SIZE 4096
+void* _grid_mem_base = 0;
+size_t _grid_mem_pages;
+struct _grid_mem_range {
+  size_t page_start, page_end;
+};
+std::vector<_grid_mem_range> _grid_mem_avail;
+std::map<void*,_grid_mem_range> _grid_mem_alloc;
+
+void gridMemoryInit() {
+  size_t free,total;
+  cudaMemGetInfo(&free,&total);
+
+  char* ev = getenv("GRID_DEVICE_BYTES_FOR_CACHE");
+  if (ev) {
+    long bytes;
+    assert(sscanf(ev,"%ld",&bytes)==1);
+    free -= bytes;
+  }
+
+  _grid_mem_pages = free / _GRID_MEM_PAGE_SIZE;
+  size_t sz = _grid_mem_pages * _GRID_MEM_PAGE_SIZE;
+
+  assert(cudaSuccess == cudaMallocManaged(&_grid_mem_base,sz));
+
+  int target;
+  cudaGetDevice(&target);
+  cudaMemAdvise(_grid_mem_base,sz,cudaMemAdviseSetPreferredLocation,target);
+
+  assert(cudaSuccess == cudaMemset(_grid_mem_base,0,sz)); // touch on device
+  std::cout << GridLogMessage << "gridMemoryInit: " << sz << " bytes" << std::endl;
+
+  _grid_mem_avail.push_back( { 0, _grid_mem_pages } );
+}
+
+void gridMallocManaged(void** pp, size_t sz) {
+
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  size_t pages = (sz + _GRID_MEM_PAGE_SIZE - 1) / _GRID_MEM_PAGE_SIZE;
+  // find free block
+  size_t m;
+  for (m=0;m<_grid_mem_avail.size();m++) {
+    auto & b = _grid_mem_avail[m];
+    if (b.page_end - b.page_start >= pages)
+      break;
+  }
+  if (m == _grid_mem_avail.size()) {
+    std::cout << GridLogMessage << "Out of memory" << std::endl;
+    assert(0);
+  }
+  *pp = (char*)_grid_mem_base + _GRID_MEM_PAGE_SIZE*_grid_mem_avail[m].page_start;
+  _grid_mem_alloc[*pp] = { _grid_mem_avail[m].page_start, _grid_mem_avail[m].page_start + pages };
+  _grid_mem_avail[m].page_start += pages;
+}
+
+void gridFree(void* p) {
+
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  auto & alloc = _grid_mem_alloc[p];
+  if (alloc.page_start == alloc.page_end) {
+    free(p);
+    //cudaFreeHost(p);
+  } else {
+    // can we enlarge existing one?
+    for (size_t m=0;m<_grid_mem_avail.size();m++) {
+      auto & b = _grid_mem_avail[m];
+      if (b.page_start == alloc.page_end) {
+        b.page_start = alloc.page_start;
+        return;
+      }
+      if (b.page_end == alloc.page_start) {
+        b.page_end = alloc.page_end;
+        return;
+      }
+    }
+    // fragment memory
+    _grid_mem_avail.push_back( alloc );
+  }
+  _grid_mem_alloc.erase(p);
+}
+
+void gridAcceleratorPrefetch(void* p, size_t sz) {
+
+  auto & alloc = _grid_mem_alloc[p];
+  if (alloc.page_start == alloc.page_end) // pinned to host
+    return;
+
+  int target;
+  cudaGetDevice(&target);
+  cudaMemPrefetchAsync(p,sz,target);
+}
+
+void gridMemGetInfo(size_t* pfree, size_t* ptotal) {
+
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  *ptotal = _grid_mem_pages * _GRID_MEM_PAGE_SIZE;
+  *pfree = 0;
+  for (auto & a : _grid_mem_avail)
+    *pfree += (a.page_end - a.page_start) * _GRID_MEM_PAGE_SIZE;
+}
+
+void gridMoveToHost(void** pp) {
+
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  auto & alloc = _grid_mem_alloc[*pp];
+  if (alloc.page_start == alloc.page_end) // already on host
+    return;
+
+  size_t sz = (alloc.page_end - alloc.page_start) * _GRID_MEM_PAGE_SIZE;
+  void*pn;
+  //assert(cudaSuccess == cudaMallocHost(&pn,sz));
+  pn = malloc(sz);
+  memcpy(pn,*pp,sz);
+  gridFree(*pp);
+  *pp = pn;
+  _grid_mem_alloc[pn] = { 0,0 };
+}
+
+NAMESPACE_END(Grid);
diff --git a/Grid/allocator/GridMemoryManager.h b/Grid/allocator/GridMemoryManager.h
new file mode 100644
index 00000000..9e619301
--- /dev/null
+++ b/Grid/allocator/GridMemoryManager.h
@@ -0,0 +1,42 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/GridMemoryManager.h
+
+    Copyright (C) 2020
+
+Author: Christoph Lehner
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#ifndef GRID_MEMORY_MANAGER_H
+#define GRID_MEMORY_MANAGER_H
+
+NAMESPACE_BEGIN(Grid);
+
+void gridMemoryInit();
+void gridMallocManaged(void** pp, size_t sz);
+void gridMoveToHost(void** pp);
+void gridAcceleratorPrefetch(void* p, size_t sz);
+void gridMemGetInfo(size_t* pfree, size_t* ptotal);
+void gridFree(void* p);
+
+NAMESPACE_END(Grid);
+
+#endif
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 284190ba..42e9e50a 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -97,33 +97,14 @@ public:
     else grid = _grid;
   };
 
-  accelerator_inline void Advise(int advise) {
-#ifdef GRID_NVCC
-#ifndef __CUDA_ARCH__ // only on host
-    if (advise & AdviseInfrequentUse) {
-      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
-    }
-    if (advise & AdviseReadMostly) {
-      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
-    }
-#endif
-#endif
-  };
-
   accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
-#ifdef GRID_NVCC
-#ifndef __CUDA_ARCH__ // only on host
-    int target;
-    cudaGetDevice(&target);
-    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
-#endif
-#endif
+    gridAcceleratorPrefetch(_odata,_odata_size*sizeof(vobj));
   };
 
   accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
-    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
+    //cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
 #endif
 #endif
   };
@@ -246,13 +227,27 @@ private:
       dealloc();
 
       this->_odata_size = size;
-      if ( size ) 
+      if ( size )
        this->_odata      = alloc.allocate(this->_odata_size);
       else
        this->_odata      = nullptr;
     }
   }
 public:
+
+  void Advise(int advise) {
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+    if (advise & AdviseInfrequentUse) {
+      gridMoveToHost((void**)&this->_odata);
+    }
+    if (advise & AdviseReadMostly) {
+      //cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
+    }
+#endif
+#endif
+  };
+
   /////////////////////////////////////////////////////////////////////////////////
   // Return a view object that may be dereferenced in site loops.
   // The view is trivially copy constructible and may be copied to an accelerator device
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index c23ddcdc..e12ef787 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -96,7 +96,7 @@ accelerator_inline void convertType(ComplexF & out, const std::complex & in) {
   out = in;
 }
 
-#ifdef __CUDA_ARCH__
+#ifdef GRID_NVCC
 accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
   ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
 }
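
Note (not part of the patch): a minimal usage sketch of the entry points introduced above, assuming a CUDA build of Grid with this patch applied. The byte counts, the GRID_DEVICE_BYTES_FOR_CACHE value, and the Grid_init/Grid_finalize scaffolding are illustrative only; the gridMallocManaged/gridAcceleratorPrefetch/gridMemGetInfo/gridMoveToHost/gridFree calls are the ones declared in GridMemoryManager.h.

// export GRID_DEVICE_BYTES_FOR_CACHE=1073741824   (illustrative: keep 1 GiB out of the pool)
#include <Grid/Grid.h>

using namespace Grid;

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);

  // The first allocation triggers gridMemoryInit(): the remaining free device
  // memory (minus GRID_DEVICE_BYTES_FOR_CACHE) is taken as one cudaMallocManaged
  // arena, pinned to the device by cudaMemAdvise and carved into 4096-byte pages.
  void* p = nullptr;
  gridMallocManaged(&p, 1024 * 1024);       // first-fit sub-allocation of 256 pages

  gridAcceleratorPrefetch(p, 1024 * 1024);  // cudaMemPrefetchAsync to the current device

  size_t pool_free, pool_total;
  gridMemGetInfo(&pool_free, &pool_total);  // pool accounting, not cudaMemGetInfo

  gridMoveToHost(&p);                       // copy into host malloc'd memory, release the arena pages
  gridFree(p);                              // host-side blocks are free()'d; pooled blocks return to the free list

  Grid_finalize();
  return 0;
}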