From 88015b08588e040abcb888041bd3f63187307834 Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Mon, 26 Dec 2022 10:01:32 +0100
Subject: [PATCH 01/10] Split sum into rankSum and GlobalSum

---
 Grid/lattice/Lattice_reduction.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 326b9ea3..d9025de0 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -144,17 +144,23 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 }
 
 template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
   autoView( arg_v, arg, AcceleratorRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu(&arg_v[0],osites);
+  return sum_gpu(&arg_v[0],osites);
 #else
   autoView(arg_v, arg, CpuRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  auto ssum = rankSum(arg);
   arg.Grid()->GlobalSum(ssum);
   return ssum;
 }
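
A quick illustration of the new split (a sketch, not part of the patch; the field and grid names are assumed): rankSum performs only the node-local reduction, while sum composes it with the cross-rank GlobalSum.

    LatticeComplex f(&grid);    // assumes an existing GridCartesian "grid"
    f = ComplexD(1.0);          // any initialisation will do
    auto local  = rankSum(f);   // this rank's partial sum; no communication
    auto global = sum(f);       // rankSum(f) followed by GlobalSum; identical on every rank
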
From 7d62f1d6d20790f681f37fe1755e3712d2a4e2b0 Mon Sep 17 00:00:00 2001
From: Makis Kappas
Date: Wed, 11 Jan 2023 21:26:25 +0000
Subject: [PATCH 02/10] Populate the Cshift_table on the GPU

The Cshift_table is allocated in unified memory and used in the
LambdaApply kernels, but it is also populated from the host. This
creates a lot of unified-memory HtoD and DtoH operations and has a
negative effect on performance. With this commit we populate the
Cshift_table on the device with the populate_Cshift_table() kernel.
---
 Grid/cshift/Cshift_common.h | 40 +++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
index cf902b58..742c99da 100644
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
     }
 }
 
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+
+template <typename T>
+T iDivUp(T a, T b) // Round a / b to nearest higher integer value
+{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+
+template <typename T>
+__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
+{
+  int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  if (idx >= e1*e2) return;
+
+  int n, b, o;
+
+  n = idx / e2;
+  b = idx % e2;
+  o = n*stride + b;
+
+  vector[2*idx + 0] = lo + o;
+  vector[2*idx + 1] = ro + o;
+}
+
+#endif
+
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 
   int ent=0;
 
   if(cbmask == 0x3 ){
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
         int o   =n*stride+b;
         Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
     }
+#endif
   } else {
     for(int n=0;n<e1;n++){
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
 
   int ent=0;
 
   if ( cbmask == 0x3 ) {
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
     for(int n=0;n<e1;n++){
     for(int b=0;b<e2;b++){
       int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
+#endif
   } else {
     for(int n=0;n<e1;n++){
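
For orientation, a sketch of the launch arithmetic above (not part of the patch; 256 is an assumed acceleratorThreads() value and ent is a made-up size):

    unsigned int ent = 1000;                    // e1*e2, assumed
    dim3 blockSize(256);                        // assumed acceleratorThreads()
    dim3 gridSize(iDivUp(ent, blockSize.x));    // iDivUp(1000u,256u) = 4 blocks
    // 4*256 = 1024 threads are launched; the kernel guard
    // "if (idx >= e1*e2) return;" retires the excess 24, and each surviving
    // thread writes one (lo+o, ro+o) pair of the Cshift_table on the device.
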
From b281b0166728bd1d321c8e7dbd32d3dbb1e49f78 Mon Sep 17 00:00:00 2001
From: Raoul Hodgson
Date: Tue, 14 Feb 2023 14:37:10 +0000
Subject: [PATCH 03/10] Add batched block project/promote functions

---
 Grid/lattice/Lattice_transfer.h | 45 +++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index ef489ea6..556785c0 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -288,7 +288,34 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
     blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
   }
 }
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+                              const std::vector<Lattice<vobj>> &fineData,
+                              const VLattice &Basis)
+{
+  int NBatch = fineData.size();
+  GridBase * fine  = fineData[0].Grid();
+  GridBase * coarse= coarseData[0].Grid();
+
+  Lattice<iScalar<CComplex>> ip(coarse);
+  std::vector<Lattice<vobj>> fineDataCopy = fineData;
+
+  autoView(ip_, ip, AcceleratorWrite);
+  for(int v=0;v<nbasis;v++) {
+    for (int k=0; k<NBatch; k++) {
+      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
+      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
+      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+        convertType(coarseData_[sc](v),ip_[sc]);
+      });
+
+      // improve numerical stability of projection
+      // |fine> = |fine> - <basis|fine> |basis>
+      ip=-ip;
+      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
+    }
+  }
+}
 
 template<class vobj,class CComplex>
 inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -590,6 +617,24 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #endif
 
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+                              std::vector<Lattice<vobj>> &fineData,
+                              const VLattice &Basis)
+{
+  int NBatch = fineData.size();
+  GridBase * fine   = fineData[0].Grid();
+  GridBase * coarse = coarseData[0].Grid();
+  for (int k=0; k<NBatch; k++)
+    fineData[k]=Zero();
+  for (int i=0;i<nbasis;i++) {
+    for (int k=0; k<NBatch; k++) {
+      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
+      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
+    }
+  }
+}
+
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
 template<class vobj,class vvobj>
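
A usage sketch for the batched variants (not part of the patch; vobj, CComplex and nbasis stand for your fine-field type, coarse complex type and basis size, and the grids and Basis are assumed to be set up as for the unbatched blockProject/blockPromote):

    // Project NBatch fine fields onto one shared basis, then reconstruct them.
    std::vector<Lattice<iVector<CComplex,nbasis>>> coarse(NBatch, Lattice<iVector<CComplex,nbasis>>(&coarseGrid));
    std::vector<Lattice<vobj>>                     fine(NBatch, Lattice<vobj>(&fineGrid));
    batchBlockProject(coarse, fine, Basis);  // basis-vector loop is outermost, so each
    batchBlockPromote(coarse, fine, Basis);  // Basis[v] is streamed once per whole batch
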
From 920a51438db5a5aaaa2f93b7308b567573cb52dc Mon Sep 17 00:00:00 2001
From: Raoul Hodgson
Date: Tue, 14 Feb 2023 17:04:13 +0000
Subject: [PATCH 04/10] Added batched Mixed precision CG

---
 Grid/algorithms/Algorithms.h                  |   1 +
 .../ConjugateGradientMixedPrecBatched.h       | 213 ++++++++++++++++++
 2 files changed, 214 insertions(+)
 create mode 100644 Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h

diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h
index 7f27784b..ff3da17d 100644
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
new file mode 100644
index 00000000..93f5c677
--- /dev/null
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
@@ -0,0 +1,213 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
+
+    Copyright (C) 2015
+
+    Author: Raoul Hodgson
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
+#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
+
+NAMESPACE_BEGIN(Grid);
+
+//Mixed precision restarted defect correction CG
+template<class FieldD,class FieldF,
+         typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+         typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
+class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
+public:
+  using LinearFunction<FieldD>::operator();
+  RealD   Tolerance;
+  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+  Integer MaxInnerIterations;
+  Integer MaxOuterIterations;
+  Integer MaxPatchupIterations;
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+  LinearOperatorBase<FieldF> &Linop_f;
+  LinearOperatorBase<FieldD> &Linop_d;
+
+  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+  LinearFunction<FieldF> *guesser;
+  bool updateResidual;
+
+  MixedPrecisionConjugateGradientBatched(RealD tol,
+                                         Integer maxinnerit,
+                                         Integer maxouterit,
+                                         Integer maxpatchit,
+                                         GridBase* _sp_grid,
+                                         LinearOperatorBase<FieldF> &_Linop_f,
+                                         LinearOperatorBase<FieldD> &_Linop_d,
+                                         bool _updateResidual=true) :
+    Linop_f(_Linop_f), Linop_d(_Linop_d),
+    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
+    OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
+
+  void useGuesser(LinearFunction<FieldF> &g){
+    guesser = &g;
+  }
+
+  void operator() (const FieldD &src_d_in, FieldD &sol_d){
+    std::vector<FieldD> srcs_d_in{src_d_in};
+    std::vector<FieldD> sols_d{sol_d};
+
+    (*this)(srcs_d_in,sols_d);
+
+    sol_d = sols_d[0];
+  }
+
+  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
+    assert(src_d_in.size() == sol_d.size());
+    int NBatch = src_d_in.size();
+
+    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
+
+    Integer TotalOuterIterations = 0; //Number of restarts
+    std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations
+    std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
+
+    GridStopWatch TotalTimer;
+    TotalTimer.Start();
+
+    GridStopWatch InnerCGtimer;
+    GridStopWatch PrecChangeTimer;
+
+    int cb = src_d_in[0].Checkerboard();
+
+    std::vector<RealD> src_norm;
+    std::vector<RealD> norm;
+    std::vector<RealD> stop;
+
+    GridBase* DoublePrecGrid = src_d_in[0].Grid();
+    FieldD tmp_d(DoublePrecGrid);
+    tmp_d.Checkerboard() = cb;
+
+    FieldD tmp2_d(DoublePrecGrid);
+    tmp2_d.Checkerboard() = cb;
+
+    std::vector<FieldD> src_d;
+    std::vector<FieldF> src_f;
+    std::vector<FieldF> sol_f;
+
+    for (int i=0; i<NBatch; i++) {
+      src_norm.push_back(norm2(src_d_in[i]));
+      norm.push_back(0.);
+      stop.push_back(src_norm[i] * Tolerance*Tolerance);
+
+      src_d.push_back(src_d_in[i]); //residual for the outer loop
+
+      src_f.push_back(SinglePrecGrid);
+      src_f[i].Checkerboard() = cb;
+
+      sol_f.push_back(SinglePrecGrid);
+      sol_f[i].Checkerboard() = cb;
+    }
+
+    RealD inner_tol = InnerTolerance;
+
+    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+    CG_f.ErrorOnNoConverge = false;
+
+    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+
+    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+      std::cout << GridLogMessage << std::endl;
+      std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
+
+      bool allConverged = true;
+
+      for (int i=0; i<NBatch; i++) {
+        //Compute double precision rsd and also new RHS vector.
+        Linop_d.HermOp(sol_d[i], tmp_d);
+        norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
+
+        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
+
+        PrecChangeTimer.Start();
+        precisionChange(src_f[i], src_d[i]);
+        PrecChangeTimer.Stop();
+
+        sol_f[i] = Zero();
+
+        if(norm[i] > OuterLoopNormMult * stop[i]) {
+          allConverged = false;
+        }
+      }
+      if (allConverged) break;
+
+      if (updateResidual) {
+        RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
+        RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
+        while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+        CG_f.Tolerance = inner_tol;
+      }
+
+      //Optionally improve inner solver guess (eg using known eigenvectors)
+      if(guesser != NULL) {
+        (*guesser)(src_f, sol_f);
+      }
+
+      for (int i=0; i<NBatch; i++) {
+        //Inner CG
+        InnerCGtimer.Start();
+        CG_f(Linop_f, src_f[i], sol_f[i]);
+        InnerCGtimer.Stop();
+        TotalInnerIterations[i] += CG_f.IterationsToComplete;
+
+        //Convert sol back to double and add to double prec solution
+        PrecChangeTimer.Start();
+        precisionChange(tmp_d, sol_f[i]);
+        PrecChangeTimer.Stop();
+
+        axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
+      }
+    }
+
+    //Final trial CG
+    std::cout << GridLogMessage << std::endl;
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
+
+    for (int i=0; i<NBatch; i++) {
+      ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
+      CG_d(Linop_d, src_d_in[i], sol_d[i]);
+      TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
+    }
+
+    TotalTimer.Stop();
+
+    std::cout << GridLogMessage << std::endl;
+    for (int i=0; i<NBatch; i++) {
+      std::cout << GridLogMessage << "MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
+    }
+    std::cout << GridLogMessage << "MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
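
A call-site sketch (not part of the patch; the operators HermOpF/HermOpD, the grids sp_grid/d_grid and NBatch are assumed to exist, as in the unbatched MixedPrecisionConjugateGradient):

    MixedPrecisionConjugateGradientBatched<LatticeFermionD,LatticeFermionF>
      mCG(1.0e-8, 10000, 50, 1000, sp_grid, HermOpF, HermOpD);
    std::vector<LatticeFermionD> src(NBatch, LatticeFermionD(d_grid));
    std::vector<LatticeFermionD> sol(NBatch, LatticeFermionD(d_grid));
    for (auto &s : sol) s = Zero();
    mCG(src, sol);  // restarted defect-correction solve of all NBatch systems,
                    // sharing the outer-iteration control across the batch
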
Date: Sun, 26 Feb 2023 12:22:45 +0000
Subject: [PATCH 05/10] Expose cached bytes

---
 Grid/allocator/MemoryManager.cc | 6 +++++-
 Grid/allocator/MemoryManager.h  | 4 +++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index d055898f..955a1f90 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -35,6 +35,8 @@ void MemoryManager::PrintBytes(void)
 
 }
 
+uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccSmall]; }
+
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
@@ -190,7 +192,9 @@ void MemoryManager::InitMessage(void) { 
 
   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<std::endl;
 #endif
 

Date: Sun, 26 Feb 2023 14:15:28 +0000
Subject: [PATCH 06/10] Add huge cache type and allow Ncache==0

---
 Grid/allocator/MemoryManager.cc | 51 ++++++++++++++++++++++-----------
 Grid/allocator/MemoryManager.h  |  3 +-
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index 955a1f90..e9097c75 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
 
 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu      (0)
-#define CpuSmall (1)
-#define Acc      (2)
-#define AccSmall (3)
-#define Shared   (4)
-#define SharedSmall (5)
+#define CpuHuge  (1)
+#define CpuSmall (2)
+#define Acc      (3)
+#define AccHuge  (4)
+#define AccSmall (5)
+#define Shared   (6)
+#define SharedHuge (7)
+#define SharedSmall (8)
 #undef GRID_MM_VERBOSE
 uint64_t total_shared;
 uint64_t total_device;
@@ -35,14 +38,14 @@ void MemoryManager::PrintBytes(void)
 
 }
 
-uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccSmall]; }
+uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
 
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
 uint64_t  MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -172,6 +175,16 @@ void MemoryManager::Init(void)
     }
   }
 
+  str= getenv("GRID_ALLOC_NCACHE_HUGE");
+  if ( str ) {
+    Nc = atoi(str);
+    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
+      Ncache[CpuHuge]=Nc;
+      Ncache[AccHuge]=Nc;
+      Ncache[SharedHuge]=Nc;
+    }
+  }
+
   str= getenv("GRID_ALLOC_NCACHE_SMALL");
   if ( str ) {
     Nc = atoi(str);
@@ -192,9 +205,9 @@ void MemoryManager::InitMessage(void) { 
 
   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
 #endif
 
@@ -224,8 +237,11 @@ void MemoryManager::InitMessage(void) { 
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-  int cache = type+small;
+  int cache;
+  if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+  else cache = type;
+
   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
 #else
   return ptr;
@@ -236,11 +252,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
-  assert(ncache>0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif
 
+  if (ncache == 0) return ptr;
+
   void * ret = NULL;
   int v = -1;
 
@@ -275,8 +292,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-  int cache = type+small;
+  int cache;
+  if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+  else cache = type;
+
   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
   return NULL;
@@ -285,7 +305,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
-  assert(ncache>0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 74390bc5..7a5f978c 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 
 #define GRID_ALLOC_SMALL_LIMIT (4096)
+#define GRID_ALLOC_HUGE_LIMIT  (2147483648)
 
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
@@ -83,7 +84,7 @@ private:
   } AllocationCacheEntry;
 
   static const int NallocCacheMax=128;
-  static const int NallocType=6;
+  static const int NallocType=9;
   static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
   static int Victim[NallocType];
   static int Ncache[NallocType];
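
A worked note on the new dispatch (a sketch restating the code above, not additional patch content):

    // type is one of Cpu (0), Acc (3), Shared (6); for each type:
    //   bytes <  GRID_ALLOC_SMALL_LIMIT (4096)       -> cache = type + 2  (small pool)
    //   bytes >= GRID_ALLOC_HUGE_LIMIT  (2147483648) -> cache = type + 1  (huge pool)
    //   otherwise                                    -> cache = type      (standard pool)
    // Huge pools default to Ncache==0, i.e. freed >=2GB blocks are released
    // immediately; setting e.g. GRID_ALLOC_NCACHE_HUGE=4 in the environment
    // would retain up to 4 such blocks per pool for reuse.
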
From a3e935c9028a77938603168aa0edd6b24f05d607 Mon Sep 17 00:00:00 2001
From: Raoul Hodgson
Date: Mon, 27 Feb 2023 11:38:16 +0000
Subject: [PATCH 07/10] Batched block project/promote size checks

---
 Grid/lattice/Lattice_transfer.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 556785c0..4d1292a4 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -294,6 +294,8 @@ inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &co
                               const VLattice &Basis)
 {
   int NBatch = fineData.size();
+  assert(coarseData.size() == NBatch);
+
   GridBase * fine  = fineData[0].Grid();
   GridBase * coarse= coarseData[0].Grid();
 
@@ -622,7 +624,9 @@ inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis
                               std::vector<Lattice<vobj>> &fineData,
                               const VLattice &Basis)
 {
-  int NBatch = fineData.size();
+  int NBatch = coarseData.size();
+  assert(fineData.size() == NBatch);
+
   GridBase * fine   = fineData[0].Grid();
   GridBase * coarse = coarseData[0].Grid();
   for (int k=0; k<NBatch; k++)

Date: Tue, 21 Mar 2023 08:57:29 -0400
Subject: [PATCH 08/10] WriteDiscard

---
 Grid/lattice/Lattice_base.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 34f13fa6..d6289de2 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -129,7 +129,7 @@ public:
 
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWrite);
+    auto me  = View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       auto tmp = eval(ss,exprCopy);
       coalescedWrite(me[ss],tmp);
@@ -152,7 +152,7 @@ public:
 
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWrite);
+    auto me  = View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
@@ -174,7 +174,7 @@ public:
     this->checkerboard=cb;
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWrite);
+    auto me  = View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       auto tmp = eval(ss,exprCopy);
       coalescedWrite(me[ss],tmp);
@@ -288,8 +288,8 @@ public:
     typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
     conformable(*this,r);
     this->checkerboard = r.Checkerboard();
-    auto me =   View(AcceleratorWrite);
     auto him= r.View(AcceleratorRead);
+    auto me =   View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       coalescedWrite(me[ss],him(ss));
     });
@@ -303,8 +303,8 @@ public:
   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
     this->checkerboard = r.Checkerboard();
     conformable(*this,r);
-    auto me =   View(AcceleratorWrite);
     auto him= r.View(AcceleratorRead);
+    auto me =   View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       coalescedWrite(me[ss],him(ss));
     });
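
Why discard semantics matter here (a hedged summary of Grid's view-mode intent, not text from the patch): an AcceleratorWrite view may first make the old field contents device-resident, whereas AcceleratorWriteDiscard marks them dead, so a full overwrite needs no transfer beforehand.

    // Expression assignment overwrites every site of 'me', so the stale
    // contents never need to be copied to the accelerator first:
    auto me = View(AcceleratorWriteDiscard);  // open view, skip any HtoD fetch
    // AcceleratorWrite remains the mode for partial, read-modify-write updates.
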
From 281488611a7b127a52e7d91930774bb7e7faa81f Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Thu, 23 Mar 2023 10:28:50 -0400
Subject: [PATCH 09/10] WriteDiscard on construct

---
 Grid/lattice/Lattice_base.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index d6289de2..838cdda5 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -245,7 +245,7 @@ public:
   ///////////////////////////////////////////
   // user defined constructor
   ///////////////////////////////////////////
-  Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) { 
+  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { 
     this->_grid = grid;
     resize(this->_grid->oSites());
     assert((((uint64_t)&this->_odata[0])&0xF) ==0);
From 481bbaf1fce5b7ef0162c6f9ecec73a80e263cc7 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Thu, 23 Mar 2023 12:55:31 -0400
Subject: [PATCH 10/10] Interface to query memory use

---
 Grid/allocator/MemoryManager.cc |  1 +
 Grid/allocator/MemoryManager.h  | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index e9097c75..a9e5c9b4 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -39,6 +39,7 @@ void MemoryManager::PrintBytes(void)
 }
 
 uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
+uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 7a5f978c..0dc78f04 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -71,6 +71,21 @@ enum ViewMode {
   CpuWriteDiscard = 0x10 // same for now
 };
 
+struct MemoryStatus {
+  uint64_t     DeviceBytes;
+  uint64_t     DeviceLRUBytes;
+  uint64_t     DeviceMaxBytes;
+  uint64_t     HostToDeviceBytes;
+  uint64_t     DeviceToHostBytes;
+  uint64_t     HostToDeviceXfer;
+  uint64_t     DeviceToHostXfer;
+  uint64_t     DeviceEvictions;
+  uint64_t     DeviceDestroy;
+  uint64_t     DeviceAllocCacheBytes;
+  uint64_t     HostAllocCacheBytes;
+};
+
+
 class MemoryManager {
 private:
 
@@ -124,7 +139,24 @@ private:
   static uint64_t     DeviceEvictions;
   static uint64_t     DeviceDestroy;
 
   static uint64_t DeviceCacheBytes();
+  static uint64_t HostCacheBytes();
 
+  static MemoryStatus GetFootprint(void) {
+    MemoryStatus stat;
+    stat.DeviceBytes           = DeviceBytes;
+    stat.DeviceLRUBytes        = DeviceLRUBytes;
+    stat.DeviceMaxBytes        = DeviceMaxBytes;
+    stat.HostToDeviceBytes     = HostToDeviceBytes;
+    stat.DeviceToHostBytes     = DeviceToHostBytes;
+    stat.HostToDeviceXfer      = HostToDeviceXfer;
+    stat.DeviceToHostXfer      = DeviceToHostXfer;
+    stat.DeviceEvictions       = DeviceEvictions;
+    stat.DeviceDestroy         = DeviceDestroy;
+    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
+    stat.HostAllocCacheBytes   = HostCacheBytes();
+    return stat;
+  };
+
 private:
 #ifndef GRID_UVM
 //////////////////////////////////////////////////////////////////////
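
A sketch of querying the new interface from application code (the printed labels are invented for illustration):

    MemoryStatus st = MemoryManager::GetFootprint();
    std::cout << GridLogMessage << "Device bytes resident " << st.DeviceBytes           << std::endl;
    std::cout << GridLogMessage << "Device cache (freed)  " << st.DeviceAllocCacheBytes << std::endl;
    std::cout << GridLogMessage << "Host cache (freed)    " << st.HostAllocCacheBytes   << std::endl;
    std::cout << GridLogMessage << "HtoD transfers        " << st.HostToDeviceXfer      << std::endl;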