From 162e4bb567858210c9e895ed0bdcbb577fe4fdaf Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 12 May 2020 07:01:23 -0400
Subject: [PATCH 1/5] no automatic prefetching for now

---
 Grid/lattice/Lattice_base.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 74525cc1..284190ba 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -267,14 +267,14 @@ public:
   LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const 
   {
     LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
-    accessor.AcceleratorPrefetch(mode);
+    //accessor.AcceleratorPrefetch(mode);
     return accessor;
   }
 
   LatticeView<vobj> HostView(int mode = ViewReadWrite) const 
   {
     LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
-    accessor.HostPrefetch(mode);
+    //accessor.HostPrefetch(mode);
     return accessor;
   }
   

From a7635fd5ba250e95483005c6988b30b27980d928 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 18 May 2020 17:52:26 -0400
Subject: [PATCH 2/5] summit mem

---
 Grid/GridCore.h                     |   1 +
 Grid/allocator/AlignedAllocator.h   |   9 +-
 Grid/allocator/GridMemoryManager.cc | 131 ++++++++++++++++++++++++++++
 Grid/allocator/GridMemoryManager.h  |  42 +++++++++
 Grid/lattice/Lattice_base.h         |  39 ++++-----
 Grid/lattice/Lattice_transfer.h     |   2 +-
 6 files changed, 197 insertions(+), 27 deletions(-)
 create mode 100644 Grid/allocator/GridMemoryManager.cc
 create mode 100644 Grid/allocator/GridMemoryManager.h

diff --git a/Grid/GridCore.h b/Grid/GridCore.h
index a48d2d49..495a81e1 100644
--- a/Grid/GridCore.h
+++ b/Grid/GridCore.h
@@ -47,6 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
+#include <Grid/allocator/GridMemoryManager.h>
 #include <Grid/allocator/AlignedAllocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/threads/Threads.h>
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 77167299..600b7097 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -178,12 +178,13 @@ public:
     ////////////////////////////////////
     if ( ptr == (_Tp *) NULL ) {
       //      printf(" alignedAllocater cache miss %ld bytes ",bytes);      BACKTRACEFP(stdout);
-      auto err = cudaMallocManaged((void **)&ptr,bytes);
-      if( err != cudaSuccess ) {
+      //      auto err = 
+gridMallocManaged((void **)&ptr,bytes);
+/*if( err != cudaSuccess ) {
 	ptr = (_Tp *) NULL;
 	std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
 	assert(0);
-      }
+	}*/
     } 
     assert( ptr != (_Tp *)NULL);
 #else 
@@ -220,7 +221,7 @@ public:
 #endif
 
 #ifdef GRID_NVCC
-    if ( __freeme ) cudaFree((void *)__freeme);
+    if ( __freeme ) gridFree((void *)__freeme);
 #else 
   #ifdef HAVE_MM_MALLOC_H
     if ( __freeme ) _mm_free((void *)__freeme); 
diff --git a/Grid/allocator/GridMemoryManager.cc b/Grid/allocator/GridMemoryManager.cc
new file mode 100644
index 00000000..677b3ed8
--- /dev/null
+++ b/Grid/allocator/GridMemoryManager.cc
@@ -0,0 +1,131 @@
+#include <Grid/GridCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#define _GRID_MEM_PAGE_SIZE 4096 // allocation granularity of the pool, in bytes
+void* _grid_mem_base = 0; // start of the pool; filled in by cudaMallocManaged in gridMemoryInit
+size_t _grid_mem_pages; // pool size, counted in pages
+struct _grid_mem_range { // page interval [page_start,page_end); its length is page_end - page_start
+  size_t page_start, page_end;
+};
+std::vector<_grid_mem_range> _grid_mem_avail; // free page ranges; empty until gridMemoryInit has run
+std::map<void*,_grid_mem_range> _grid_mem_alloc; // known pointers -> pages; page_start == page_end marks a host pointer
+
+void gridMemoryInit() {
+  // Reserve one managed pool spanning the free device memory (minus an optional
+  // GRID_DEVICE_BYTES_FOR_CACHE reserve) and register it as a single free page range.
+  size_t free,total;
+  cudaError_t err = cudaMemGetInfo(&free,&total);
+  assert(err == cudaSuccess);
+  char* ev = getenv("GRID_DEVICE_BYTES_FOR_CACHE");
+  if (ev) {
+    long bytes = 0;
+    int nread = sscanf(ev,"%ld",&bytes); // parsed outside assert() so it still runs under NDEBUG
+    assert(nread == 1 && bytes >= 0 && (size_t)bytes <= free); // guards the size_t subtraction below
+    free -= bytes;
+  }
+  _grid_mem_pages = free / _GRID_MEM_PAGE_SIZE;
+  size_t sz = _grid_mem_pages * _GRID_MEM_PAGE_SIZE;
+  err = cudaMallocManaged(&_grid_mem_base,sz); // called outside assert() so it still runs under NDEBUG
+  assert(err == cudaSuccess);
+  int target;
+  cudaGetDevice(&target);
+  cudaMemAdvise(_grid_mem_base,sz,cudaMemAdviseSetPreferredLocation,target);
+  err = cudaMemset(_grid_mem_base,0,sz); // touch on device
+  assert(err == cudaSuccess);
+  std::cout << GridLogMessage << "gridMemoryInit: " << sz << " bytes" << std::endl;
+  _grid_mem_avail.push_back( { 0, _grid_mem_pages } );
+}
+
+void gridMallocManaged(void** pp, size_t sz) {
+  // First-fit allocation of whole pages from the pool; the address is returned in *pp.
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  size_t pages = (sz + _GRID_MEM_PAGE_SIZE - 1) / _GRID_MEM_PAGE_SIZE;
+  if (pages == 0) pages = 1; // an empty range is the host-pointer marker; never record one for pool memory
+  size_t m;
+  for (m=0;m<_grid_mem_avail.size();m++) {
+    auto & b = _grid_mem_avail[m];
+    if (b.page_end - b.page_start >= pages)
+      break;
+  }
+  if (m == _grid_mem_avail.size()) {
+    std::cout << GridLogMessage << "Out of memory" << std::endl;
+    assert(0);
+  }
+  *pp = (char*)_grid_mem_base + _GRID_MEM_PAGE_SIZE*_grid_mem_avail[m].page_start;
+  _grid_mem_alloc[*pp] = { _grid_mem_avail[m].page_start, _grid_mem_avail[m].page_start + pages };
+  _grid_mem_avail[m].page_start += pages; // carve the allocation off the front of the free range
+}
+
+void gridFree(void* p) {
+  // Give p back: host pointers (empty page range) go to free(), pool pages to the free list.
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  auto & alloc = _grid_mem_alloc[p];
+  if (alloc.page_start == alloc.page_end) {
+    free(p);
+    //cudaFreeHost(p);
+  } else {
+    // can we enlarge existing one?
+    size_t m;
+    for (m=0;m<_grid_mem_avail.size();m++) {
+      auto & b = _grid_mem_avail[m];
+      if (b.page_start == alloc.page_end) {
+	b.page_start = alloc.page_start;
+	break;
+      }
+      if (b.page_end == alloc.page_start) {
+	b.page_end = alloc.page_end;
+	break;
+      }
+    }
+    if (m == _grid_mem_avail.size()) _grid_mem_avail.push_back( alloc ); // no neighbour found: fragment memory
+  }
+  _grid_mem_alloc.erase(p); // reached on every path, so a merged block leaves no stale entry behind
+}
+
+void gridAcceleratorPrefetch(void* p, size_t sz) {
+  // Issue cudaMemPrefetchAsync of sz bytes at p to the current device, for pool pointers only.
+  auto it = _grid_mem_alloc.find(p); // find() instead of operator[]: a lookup must not insert entries
+  if (it == _grid_mem_alloc.end() || it->second.page_start == it->second.page_end) // unknown or pinned to host
+    return;
+
+  int target;
+  cudaGetDevice(&target);
+  cudaMemPrefetchAsync(p,sz,target);
+}
+
+void gridMemGetInfo(size_t* pfree, size_t* ptotal) {
+  // Report the pool capacity in *ptotal and the bytes on the free list in *pfree.
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+  size_t free_pages = 0;
+  for (size_t m=0;m<_grid_mem_avail.size();m++)
+    free_pages += _grid_mem_avail[m].page_end - _grid_mem_avail[m].page_start;
+  *ptotal = _grid_mem_pages * _GRID_MEM_PAGE_SIZE;
+  *pfree = free_pages * _GRID_MEM_PAGE_SIZE;
+}
+
+void gridMoveToHost(void** pp) {
+  // Replace the pool allocation *pp by a malloc'ed copy and return its pages to the pool.
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  auto & alloc = _grid_mem_alloc[*pp];
+  if (alloc.page_start == alloc.page_end) // already on host
+    return;
+
+  size_t sz = (alloc.page_end - alloc.page_start) * _GRID_MEM_PAGE_SIZE; // whole pages, read before gridFree drops the entry
+  //assert(cudaSuccess == cudaMallocHost(&pn,sz));
+  void* pn = malloc(sz);
+  assert(pn != NULL); // do not memcpy into a failed allocation
+  memcpy(pn,*pp,sz);
+  gridFree(*pp);
+  *pp = pn;
+  _grid_mem_alloc[pn] = { 0,0 }; // empty range marks the new pointer as host memory
+}
+
+NAMESPACE_END(Grid);
diff --git a/Grid/allocator/GridMemoryManager.h b/Grid/allocator/GridMemoryManager.h
new file mode 100644
index 00000000..9e619301
--- /dev/null
+++ b/Grid/allocator/GridMemoryManager.h
@@ -0,0 +1,42 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/GridMemoryManager.h
+
+    Copyright (C) 2020
+
+Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MEMORY_MANAGER_H
+#define GRID_MEMORY_MANAGER_H
+
+NAMESPACE_BEGIN(Grid);
+
+void gridMemoryInit();
+void gridMallocManaged(void** pp, size_t sz);
+void gridMoveToHost(void** pp);
+void gridAcceleratorPrefetch(void* p, size_t sz);
+void gridMemGetInfo(size_t* pfree, size_t* ptotal);
+void gridFree(void* p);
+
+NAMESPACE_END(Grid);
+
+#endif
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 284190ba..42e9e50a 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -97,33 +97,14 @@ public:
     else      grid = _grid;
   };
 
-  accelerator_inline void Advise(int advise) {
-#ifdef GRID_NVCC
-#ifndef __CUDA_ARCH__ // only on host
-    if (advise & AdviseInfrequentUse) {
-      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
-    }
-    if (advise & AdviseReadMostly) {
-      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
-    }
-#endif
-#endif
-  };
-
   accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
-#ifdef GRID_NVCC
-#ifndef __CUDA_ARCH__ // only on host
-    int target;
-    cudaGetDevice(&target);
-    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
-#endif
-#endif
+    gridAcceleratorPrefetch(_odata,_odata_size*sizeof(vobj));
   };
 
   accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
-    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
+    //cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
 #endif
 #endif
   };
@@ -246,13 +227,27 @@ private:
       dealloc();
       
       this->_odata_size = size;
-      if ( size ) 
+      if ( size )
 	this->_odata      = alloc.allocate(this->_odata_size);
       else 
 	this->_odata      = nullptr;
     }
   }
 public:
+
+  void Advise(int advise) {
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+    if (advise & AdviseInfrequentUse) {
+      gridMoveToHost((void**)&this->_odata);
+    }
+    if (advise & AdviseReadMostly) {
+      //cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
+    }
+#endif
+#endif
+  };
+
   /////////////////////////////////////////////////////////////////////////////////
   // Return a view object that may be dereferenced in site loops.
   // The view is trivially copy constructible and may be copied to an accelerator device
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index c23ddcdc..e12ef787 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -96,7 +96,7 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
   out = in;
 }
 
-#ifdef __CUDA_ARCH__
+#ifdef GRID_NVCC
 accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
   ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
 }

From 9fcb47ee63246dc180963ce840c8da525238d5b3 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 2 Jun 2020 07:44:38 -0400
Subject: [PATCH 3/5] Explicit error message instead of infinite loop in
 GlobalSharedMemory::GetShmDims

---
 Grid/communicator/SharedMemoryMPI.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index ed465252..0de48cfe 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -170,17 +170,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
   std::vector<int> primes({2,3,5});
 
   int dim = 0;
+  int last_dim = ndimension - 1; // dimension at which a full cycle without progress is detected
   int AutoShmSize = 1;
   while(AutoShmSize != WorldShmSize) {
-    for(int p=0;p<primes.size();p++) {
+    int p;
+    for(p=0;p<primes.size();p++) {
       int prime=primes[p];
       if ( divides(prime,WorldDims[dim]/ShmDims[dim])
         && divides(prime,WorldShmSize/AutoShmSize)  ) {
 	AutoShmSize*=prime;
 	ShmDims[dim]*=prime;
+	last_dim = dim; // give up only after every dimension, this one included, has been retried
 	break;
       }
     }
+    if (p == primes.size() && last_dim == dim) { // no prime fitted and we are back where progress was last made
+      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
+      exit(EXIT_FAILURE);
+    }
     dim=(dim+1) %ndimension;
   }
 }

From 5f5807d60a7f6bcbc16dc5a25a157fb54cc6edcd Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 12 Jun 2020 14:48:23 -0400
Subject: [PATCH 4/5] cleanup

---
 Grid/allocator/GridMemoryManager.cc | 145 ----------------------------
 Grid/allocator/GridMemoryManager.h  |  42 --------
 Grid/lattice/Lattice_base.h         |  13 ---
 3 files changed, 200 deletions(-)
 delete mode 100644 Grid/allocator/GridMemoryManager.cc
 delete mode 100644 Grid/allocator/GridMemoryManager.h

diff --git a/Grid/allocator/GridMemoryManager.cc b/Grid/allocator/GridMemoryManager.cc
deleted file mode 100644
index 369f72f7..00000000
--- a/Grid/allocator/GridMemoryManager.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-#include <Grid/GridCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#define _GRID_MEM_PAGE_SIZE 4096
-void* _grid_mem_base = 0;
-size_t _grid_mem_pages;
-struct _grid_mem_range {
-  size_t page_start, page_end;
-};
-std::vector<_grid_mem_range> _grid_mem_avail;
-std::map<void*,_grid_mem_range> _grid_mem_alloc;
-
-void gridMemoryInit() {
-#ifdef GRID_NVCC
-  size_t free,total;
-  cudaMemGetInfo(&free,&total);
-  
-  char* ev = getenv("GRID_DEVICE_BYTES_FOR_CACHE");
-  if (ev) {
-    long bytes;
-    assert(sscanf(ev,"%ld",&bytes)==1);
-    free -= bytes;
-  }
-
-  _grid_mem_pages = free / _GRID_MEM_PAGE_SIZE;
-  size_t sz = _grid_mem_pages * _GRID_MEM_PAGE_SIZE;
-
-  assert(cudaSuccess == cudaMallocManaged(&_grid_mem_base,sz));
-  
-  int target;
-  cudaGetDevice(&target);
-  cudaMemAdvise(_grid_mem_base,sz,cudaMemAdviseSetPreferredLocation,target);
-
-  assert(cudaSuccess == cudaMemset(_grid_mem_base,0,sz)); // touch on device
-  std::cout << GridLogMessage << "gridMemoryInit: " << sz << " bytes" << std::endl;
-
-  _grid_mem_avail.push_back( { 0, _grid_mem_pages } );
-#endif
-}
-
-void gridMallocManaged(void** pp, size_t sz) {
-#ifdef GRID_NVCC
-  if (_grid_mem_avail.empty())
-    gridMemoryInit();
-
-  size_t pages = (sz + _GRID_MEM_PAGE_SIZE - 1) / _GRID_MEM_PAGE_SIZE;
-  // find free block
-  size_t m;
-  for (m=0;m<_grid_mem_avail.size();m++) {
-    auto & b = _grid_mem_avail[m];
-    if (b.page_end - b.page_start >= pages)
-      break;
-  }
-  if (m == _grid_mem_avail.size()) {
-    std::cout << GridLogMessage << "Out of memory" << std::endl;
-    assert(0);
-  }
-  *pp = (char*)_grid_mem_base + _GRID_MEM_PAGE_SIZE*_grid_mem_avail[m].page_start;
-  _grid_mem_alloc[*pp] = { _grid_mem_avail[m].page_start, _grid_mem_avail[m].page_start + pages };
-  _grid_mem_avail[m].page_start += pages;
-#else
-  *pp = malloc(sz);
-#endif
-}
-
-void gridFree(void* p) {
-#ifdef GRID_NVCC
-  if (_grid_mem_avail.empty())
-    gridMemoryInit();
-
-  auto & alloc = _grid_mem_alloc[p];
-  if (alloc.page_start == alloc.page_end) {
-    free(p);
-    //cudaFreeHost(p);
-  } else {
-    // can we enlarge existing one?
-    for (size_t m=0;m<_grid_mem_avail.size();m++) {
-      auto & b = _grid_mem_avail[m];
-      if (b.page_start == alloc.page_end) {
-	b.page_start = alloc.page_start;
-	return;
-      }
-      if (b.page_end == alloc.page_start) {
-	b.page_end = alloc.page_end;
-	return;
-      }
-    }
-    // fragment memory
-    _grid_mem_avail.push_back( alloc );  
-  }
-  _grid_mem_alloc.erase(p);
-#else
-  free(p);
-#endif
-}
-
-void gridAcceleratorPrefetch(void* p, size_t sz) {
-#ifdef GRID_NVCC
-  auto & alloc = _grid_mem_alloc[p];
-  if (alloc.page_start == alloc.page_end) // pinned to host
-    return;
-
-  int target;
-  cudaGetDevice(&target);
-  cudaMemPrefetchAsync(p,sz,target);
-#endif
-}
-
-void gridMemGetInfo(size_t* pfree, size_t* ptotal) {
-#ifdef GRID_NVCC
-  if (_grid_mem_avail.empty())
-    gridMemoryInit();
-
-  *ptotal = _grid_mem_pages * _GRID_MEM_PAGE_SIZE;
-  *pfree = 0;
-  for (auto & a : _grid_mem_avail)
-    *pfree += (a.page_end - a.page_start) * _GRID_MEM_PAGE_SIZE;
-#else
-  *pfree = 0;
-  *ptotal = 0;
-#endif
-}
-
-void gridMoveToHost(void** pp) {
-#ifdef GRID_NVCC
-  if (_grid_mem_avail.empty())
-    gridMemoryInit();
-
-  auto & alloc = _grid_mem_alloc[*pp];
-  if (alloc.page_start == alloc.page_end) // already on host
-    return;
-
-  size_t sz = (alloc.page_end - alloc.page_start) * _GRID_MEM_PAGE_SIZE;
-  void*pn;
-  //assert(cudaSuccess == cudaMallocHost(&pn,sz));
-  pn = malloc(sz);
-  memcpy(pn,*pp,sz);
-  gridFree(*pp);
-  *pp = pn;
-  _grid_mem_alloc[pn] = { 0,0 };
-#endif
-}
-
-NAMESPACE_END(Grid);
diff --git a/Grid/allocator/GridMemoryManager.h b/Grid/allocator/GridMemoryManager.h
deleted file mode 100644
index 9e619301..00000000
--- a/Grid/allocator/GridMemoryManager.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/GridMemoryManager.h
-
-    Copyright (C) 2020
-
-Author: Christoph Lehner <christoph@lhnr.de>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_MEMORY_MANAGER_H
-#define GRID_MEMORY_MANAGER_H
-
-NAMESPACE_BEGIN(Grid);
-
-void gridMemoryInit();
-void gridMallocManaged(void** pp, size_t sz);
-void gridMoveToHost(void** pp);
-void gridAcceleratorPrefetch(void* p, size_t sz);
-void gridMemGetInfo(size_t* pfree, size_t* ptotal);
-void gridFree(void* p);
-
-NAMESPACE_END(Grid);
-
-#endif
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 668583a1..73b1b6a1 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -81,19 +81,6 @@ private:
   }
 public:
 
-  void Advise(int advise) {
-#ifdef GRID_NVCC
-#ifndef __CUDA_ARCH__ // only on host
-    if (advise & AdviseInfrequentUse) {
-      gridMoveToHost((void**)&this->_odata);
-    }
-    if (advise & AdviseReadMostly) {
-      //cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
-    }
-#endif
-#endif
-  };
-
   /////////////////////////////////////////////////////////////////////////////////
   // Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
   /////////////////////////////////////////////////////////////////////////////////

From b5e87e8d9746ead5baffa477063d119232db3d8e Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 12 Jun 2020 18:16:12 -0400
Subject: [PATCH 5/5] summit compile fixes

---
 Grid/algorithms/CoarsenedMatrix.h | 33 +++++++++++++++++++++++++++++++
 Grid/allocator/MemoryManager.h    |  2 +-
 Grid/lattice/Lattice_transfer.h   | 19 +++++++++---------
 Grid/lattice/Lattice_view.h       |  7 +++++--
 4 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index e56b39c5..8d184aea 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -120,6 +120,39 @@ public:
     blockPromote(CoarseVec,FineVec,subspace);
   }
 
+  // Fill subspace[0..nn) with Gaussian noise vectors passed through ConjugateGradient(1.0e-2,100,false) on hermop.
+  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
+    const int filter_passes = 1; // CG sweeps applied to each vector
+    ConjugateGradient<FineField> smoother(1.0e-2,100,false);
+    FineField vec(FineGrid);
+    FineField Mvec(FineGrid);
+
+    for(int b=0;b<nn;b++){
+      subspace[b] = Zero();
+
+      // draw Gaussian noise and rescale it by norm2^(-1/2)
+      gaussian(RNG,vec);
+      RealD inv_norm = std::pow(norm2(vec),-0.5);
+      vec = vec*inv_norm;
+
+      hermop.Op(vec,Mvec);
+      std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mvec)<<std::endl;
+
+      for(int pass=0;pass<filter_passes;pass++){
+	// run CG with the current vector as source, then rescale the result the same way
+	smoother(hermop,vec,subspace[b]);
+	vec = subspace[b];
+	inv_norm = std::pow(norm2(vec),-0.5);
+	vec = vec*inv_norm;
+      }
+
+      hermop.Op(vec,Mvec);
+      std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mvec)<<std::endl;
+
+      subspace[b] = vec;
+    }
+  }
+
   ////////////////////////////////////////////////////////////////////////////////////////////////
   // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
   // and this is the best I found
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index f94f3cd8..ec56f86f 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 // Advise the LatticeAccelerator class
 ////////////////////////////////////////////////////////////////////////////
 enum ViewAdvise {
- AdviseDefault       = 0x0,    // Reegular data
+ AdviseDefault       = 0x0,    // Regular data
  AdviseInfrequentUse = 0x1,    // Advise that the data is used infrequently.  This can
                                // significantly influence performance of bulk storage.
  
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index ad2d07cb..7041f01a 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -164,7 +164,7 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
 ////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj>
 inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
--> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
+-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
 {
   autoView( lhs_v , lhs, AcceleratorRead);
   autoView( rhs_v , rhs, AcceleratorRead);
@@ -283,7 +283,7 @@ template<class vobj,class CComplex>
   Lattice<dotp> coarse_inner(coarse);
 
   // Precision promotion
-  fine_inner = localInnerProductD(fineX,fineY);
+  fine_inner = localInnerProductD<vobj>(fineX,fineY);
   blockSum(coarse_inner,fine_inner);
   {
     autoView( CoarseInner_  , CoarseInner,AcceleratorWrite);
@@ -486,13 +486,14 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
   for(int i=0;i<nbasis;i++) {
     Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
 
-    Lattice<CComplex> cip(coarse);
-    autoView( cip_ , cip, AcceleratorWrite);
-    autoView(  ip_ ,  ip, AcceleratorRead);
-    accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
-	coalescedWrite(cip_[sc], ip_(sc)());
-    });
-    blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
+    //Lattice<CComplex> cip(coarse);
+    //autoView( cip_ , cip, AcceleratorWrite);
+    //autoView(  ip_ ,  ip, AcceleratorRead);
+    //accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
+    //	coalescedWrite(cip_[sc], ip_(sc)());
+    //  });
+    //blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
+    blockZAXPY(fineData,ip,Basis[i],fineData);
   }
 }
 #endif
diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h
index d21ab874..a10acd87 100644
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -30,11 +30,14 @@ protected:
   int checkerboard;
   vobj     *_odata;    // A managed pointer
   uint64_t _odata_size;    
+  ViewAdvise advise;
 public:
-  accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { }; 
+  accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { }; 
   accelerator_inline uint64_t oSites(void) const { return _odata_size; };
   accelerator_inline int  Checkerboard(void) const { return checkerboard; };
   accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
+  accelerator_inline ViewAdvise Advise(void) const { return advise; };
+  accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
   accelerator_inline void Conformable(GridBase * &grid) const
   { 
     if (grid) conformable(grid, _grid);
@@ -86,7 +89,7 @@ public:
       MemoryManager::ViewOpen(this->cpu_ptr,
 				this->_odata_size*sizeof(vobj),
 				mode,
-				AdviseDefault);    
+				this->advise);    
   }
   void ViewClose(void)
   { // Inform the manager