From 88015b08588e040abcb888041bd3f63187307834 Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Mon, 26 Dec 2022 10:01:32 +0100
Subject: [PATCH 01/10] Split sum into rankSum and GlobalSum

---
 Grid/lattice/Lattice_reduction.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 326b9ea3..d9025de0 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -144,17 +144,23 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 }
 
 template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
   autoView( arg_v, arg, AcceleratorRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu(&arg_v[0],osites);
+  return sum_gpu(&arg_v[0],osites);
 #else
   autoView(arg_v, arg, CpuRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  auto ssum = rankSum(arg);
   arg.Grid()->GlobalSum(ssum);
   return ssum;
 }
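
A quick illustration of the new split (a sketch, not part of the patch; the field and grid names are assumed): rankSum performs only the node-local reduction, while sum composes it with the cross-rank GlobalSum.

    LatticeComplex f(&grid);    // assumes an existing GridCartesian "grid"
    f = ComplexD(1.0);          // any initialisation will do
    auto local  = rankSum(f);   // this rank's partial sum; no communication
    auto global = sum(f);       // rankSum(f) followed by GlobalSum; identical on every rank
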
From 7d62f1d6d20790f681f37fe1755e3712d2a4e2b0 Mon Sep 17 00:00:00 2001
From: Makis Kappas
Date: Wed, 11 Jan 2023 21:26:25 +0000
Subject: [PATCH 02/10] Populate the Cshift_table on the GPU

The Cshift_table is allocated in unified memory and used in the
LambdaApply kernels, but it is also populated from the host. This
creates a lot of unified-memory HtoD and DtoH operations and has a
negative effect on performance. With this commit we populate the
Cshift_table on the device with the populate_Cshift_table() kernel.
---
 Grid/cshift/Cshift_common.h | 40 +++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
index cf902b58..742c99da 100644
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
     }
 }
 
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+
+template <typename T>
+T iDivUp(T a, T b) // Round a / b to nearest higher integer value
+{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+
+template <typename T>
+__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
+{
+  int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  if (idx >= e1*e2) return;
+
+  int n, b, o;
+
+  n = idx / e2;
+  b = idx % e2;
+  o = n*stride + b;
+
+  vector[2*idx + 0] = lo + o;
+  vector[2*idx + 1] = ro + o;
+}
+
+#endif
+
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 
   int ent=0;
 
   if(cbmask == 0x3 ){
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
         int o   =n*stride+b;
         Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
     }
+#endif
   } else {
     for(int n=0;n<e1;n++){
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
 
   int ent=0;
 
   if ( cbmask == 0x3 ) {
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
     for(int n=0;n<e1;n++){
     for(int b=0;b<e2;b++){
       int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
+#endif
   } else {
     for(int n=0;n<e1;n++){
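
For orientation, a sketch of the launch arithmetic above (not part of the patch; 256 is an assumed acceleratorThreads() value and ent is a made-up size):

    unsigned int ent = 1000;                    // e1*e2, assumed
    dim3 blockSize(256);                        // assumed acceleratorThreads()
    dim3 gridSize(iDivUp(ent, blockSize.x));    // iDivUp(1000u,256u) = 4 blocks
    // 4*256 = 1024 threads are launched; the kernel guard
    // "if (idx >= e1*e2) return;" retires the excess 24, and each surviving
    // thread writes one (lo+o, ro+o) pair of the Cshift_table on the device.
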
From b281b0166728bd1d321c8e7dbd32d3dbb1e49f78 Mon Sep 17 00:00:00 2001
From: Raoul Hodgson
Date: Tue, 14 Feb 2023 14:37:10 +0000
Subject: [PATCH 03/10] Add batched block project/promote functions

---
 Grid/lattice/Lattice_transfer.h | 45 +++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index ef489ea6..556785c0 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -288,7 +288,34 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
     blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
   }
 }
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+                              const std::vector<Lattice<vobj>> &fineData,
+                              const VLattice &Basis)
+{
+  int NBatch = fineData.size();
+  GridBase * fine  = fineData[0].Grid();
+  GridBase * coarse= coarseData[0].Grid();
+
+  Lattice<iScalar<CComplex>> ip(coarse);
+  std::vector<Lattice<vobj>> fineDataCopy = fineData;
+
+  autoView(ip_, ip, AcceleratorWrite);
+  for(int v=0;v<nbasis;v++) {
+    for (int k=0; k<NBatch; k++) {
+      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
+      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
+      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+        convertType(coarseData_[sc](v),ip_[sc]);
+      });
+
+      // improve numerical stability of projection
+      // |fine> = |fine> - <basis|fine> |basis>
+      ip=-ip;
+      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
+    }
+  }
+}
 
 template<class vobj,class CComplex>
 inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -590,6 +617,24 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #endif
 
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+                              std::vector<Lattice<vobj>> &fineData,
+                              const VLattice &Basis)
+{
+  int NBatch = fineData.size();
+  GridBase * fine   = fineData[0].Grid();
+  GridBase * coarse = coarseData[0].Grid();
+  for (int k=0; k<NBatch; k++)
+    fineData[k]=Zero();
+  for (int i=0;i<nbasis;i++) {
+    for (int k=0; k<NBatch; k++) {
+      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
+      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
+    }
+  }
+}
+
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
 template<class vobj,class vvobj>
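
A usage sketch for the batched variants (not part of the patch; vobj, CComplex and nbasis stand for your fine-field type, coarse complex type and basis size, and the grids and Basis are assumed to be set up as for the unbatched blockProject/blockPromote):

    // Project NBatch fine fields onto one shared basis, then reconstruct them.
    std::vector<Lattice<iVector<CComplex,nbasis>>> coarse(NBatch, Lattice<iVector<CComplex,nbasis>>(&coarseGrid));
    std::vector<Lattice<vobj>>                     fine(NBatch, Lattice<vobj>(&fineGrid));
    batchBlockProject(coarse, fine, Basis);  // basis-vector loop is outermost, so each
    batchBlockPromote(coarse, fine, Basis);  // Basis[v] is streamed once per whole batch
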
From 920a51438db5a5aaaa2f93b7308b567573cb52dc Mon Sep 17 00:00:00 2001
From: Raoul Hodgson
Date: Tue, 14 Feb 2023 17:04:13 +0000
Subject: [PATCH 04/10] Added batched Mixed precision CG

---
 Grid/algorithms/Algorithms.h                  |   1 +
 .../ConjugateGradientMixedPrecBatched.h       | 213 ++++++++++++++++++
 2 files changed, 214 insertions(+)
 create mode 100644 Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h

diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h
index 7f27784b..ff3da17d 100644
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
new file mode 100644
index 00000000..93f5c677
--- /dev/null
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
@@ -0,0 +1,213 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
+
+    Copyright (C) 2015
+
+    Author: Raoul Hodgson
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
+#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
+
+NAMESPACE_BEGIN(Grid);
+
+//Mixed precision restarted defect correction CG
+template<class FieldD,class FieldF,
+         typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+         typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
+class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
+public:
+  using LinearFunction<FieldD>::operator();
+  RealD   Tolerance;
+  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+  Integer MaxInnerIterations;
+  Integer MaxOuterIterations;
+  Integer MaxPatchupIterations;
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+  LinearOperatorBase<FieldF> &Linop_f;
+  LinearOperatorBase<FieldD> &Linop_d;
+
+  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+  LinearFunction<FieldF> *guesser;
+  bool updateResidual;
+
+  MixedPrecisionConjugateGradientBatched(RealD tol,
+                                         Integer maxinnerit,
+                                         Integer maxouterit,
+                                         Integer maxpatchit,
+                                         GridBase* _sp_grid,
+                                         LinearOperatorBase<FieldF> &_Linop_f,
+                                         LinearOperatorBase<FieldD> &_Linop_d,
+                                         bool _updateResidual=true) :
+    Linop_f(_Linop_f), Linop_d(_Linop_d),
+    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
+    OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
+
+  void useGuesser(LinearFunction<FieldF> &g){
+    guesser = &g;
+  }
+
+  void operator() (const FieldD &src_d_in, FieldD &sol_d){
+    std::vector<FieldD> srcs_d_in{src_d_in};
+    std::vector<FieldD> sols_d{sol_d};
+
+    (*this)(srcs_d_in,sols_d);
+
+    sol_d = sols_d[0];
+  }
+
+  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
+    assert(src_d_in.size() == sol_d.size());
+    int NBatch = src_d_in.size();
+
+    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
+
+    Integer TotalOuterIterations = 0; //Number of restarts
+    std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations
+    std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
+
+    GridStopWatch TotalTimer;
+    TotalTimer.Start();
+
+    GridStopWatch InnerCGtimer;
+    GridStopWatch PrecChangeTimer;
+
+    int cb = src_d_in[0].Checkerboard();
+
+    std::vector<RealD> src_norm;
+    std::vector<RealD> norm;
+    std::vector<RealD> stop;
+
+    GridBase* DoublePrecGrid = src_d_in[0].Grid();
+    FieldD tmp_d(DoublePrecGrid);
+    tmp_d.Checkerboard() = cb;
+
+    FieldD tmp2_d(DoublePrecGrid);
+    tmp2_d.Checkerboard() = cb;
+
+    std::vector<FieldD> src_d;
+    std::vector<FieldF> src_f;
+    std::vector<FieldF> sol_f;
+
+    for (int i=0; i<NBatch; i++) {
+      src_norm.push_back(norm2(src_d_in[i]));
+      norm.push_back(0.);
+      stop.push_back(src_norm[i] * Tolerance*Tolerance);
+
+      src_d.push_back(src_d_in[i]); //residual for the outer loop
+
+      src_f.push_back(SinglePrecGrid);
+      src_f[i].Checkerboard() = cb;
+
+      sol_f.push_back(SinglePrecGrid);
+      sol_f[i].Checkerboard() = cb;
+    }
+
+    RealD inner_tol = InnerTolerance;
+
+    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+    CG_f.ErrorOnNoConverge = false;
+
+    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+
+    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+      std::cout << GridLogMessage << std::endl;
+      std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
+
+      bool allConverged = true;
+
+      for (int i=0; i<NBatch; i++) {
+        //Compute double precision rsd and also new RHS vector.
+        Linop_d.HermOp(sol_d[i], tmp_d);
+        norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
+
+        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
+
+        PrecChangeTimer.Start();
+        precisionChange(src_f[i], src_d[i]);
+        PrecChangeTimer.Stop();
+
+        sol_f[i] = Zero();
+
+        if(norm[i] > OuterLoopNormMult * stop[i]) {
+          allConverged = false;
+        }
+      }
+      if (allConverged) break;
+
+      if (updateResidual) {
+        RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
+        RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
+        while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+        CG_f.Tolerance = inner_tol;
+      }
+
+      //Optionally improve inner solver guess (eg using known eigenvectors)
+      if(guesser != NULL) {
+        (*guesser)(src_f, sol_f);
+      }
+
+      for (int i=0; i<NBatch; i++) {
+        //Inner CG
+        InnerCGtimer.Start();
+        CG_f(Linop_f, src_f[i], sol_f[i]);
+        InnerCGtimer.Stop();
+        TotalInnerIterations[i] += CG_f.IterationsToComplete;
+
+        //Convert sol back to double and add to double prec solution
+        PrecChangeTimer.Start();
+        precisionChange(tmp_d, sol_f[i]);
+        PrecChangeTimer.Stop();
+
+        axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
+      }
+    }
+
+    //Final trial CG
+    std::cout << GridLogMessage << std::endl;
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
+
+    for (int i=0; i<NBatch; i++) {
+      ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
+      CG_d(Linop_d, src_d_in[i], sol_d[i]);
+      TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
+    }
+
+    TotalTimer.Stop();
+
+    std::cout << GridLogMessage << std::endl;
+    for (int i=0; i<NBatch; i++) {
+      std::cout << GridLogMessage << "MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
+    }
+    std::cout << GridLogMessage << "MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
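
A call-site sketch (not part of the patch; the operators HermOpF/HermOpD, the grids sp_grid/d_grid and NBatch are assumed to exist, as in the unbatched MixedPrecisionConjugateGradient):

    MixedPrecisionConjugateGradientBatched<LatticeFermionD,LatticeFermionF>
      mCG(1.0e-8, 10000, 50, 1000, sp_grid, HermOpF, HermOpD);
    std::vector<LatticeFermionD> src(NBatch, LatticeFermionD(d_grid));
    std::vector<LatticeFermionD> sol(NBatch, LatticeFermionD(d_grid));
    for (auto &s : sol) s = Zero();
    mCG(src, sol);  // restarted defect-correction solve of all NBatch systems,
                    // sharing the outer-iteration control across the batch
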
Date: Sun, 26 Feb 2023 12:22:45 +0000
Subject: [PATCH 05/10] Expose cached bytes

---
 Grid/allocator/MemoryManager.cc | 6 +++++-
 Grid/allocator/MemoryManager.h  | 4 +++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index d055898f..955a1f90 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -35,6 +35,8 @@ void MemoryManager::PrintBytes(void)
 
 }
 
+uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccSmall]; }
+
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
@@ -190,7 +192,9 @@ void MemoryManager::InitMessage(void) { 
 
   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<std::endl;
 #endif
 

Date: Sun, 26 Feb 2023 14:15:28 +0000
Subject: [PATCH 06/10] Add huge cache type and allow Ncache==0

---
 Grid/allocator/MemoryManager.cc | 51 ++++++++++++++++++++++-----------
 Grid/allocator/MemoryManager.h  |  3 +-
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index 955a1f90..e9097c75 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
 
 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu      (0)
-#define CpuSmall (1)
-#define Acc      (2)
-#define AccSmall (3)
-#define Shared   (4)
-#define SharedSmall (5)
+#define CpuHuge  (1)
+#define CpuSmall (2)
+#define Acc      (3)
+#define AccHuge  (4)
+#define AccSmall (5)
+#define Shared   (6)
+#define SharedHuge (7)
+#define SharedSmall (8)
 #undef GRID_MM_VERBOSE
 uint64_t total_shared;
 uint64_t total_device;
@@ -35,14 +38,14 @@ void MemoryManager::PrintBytes(void)
 
 }
 
-uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccSmall]; }
+uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
 
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
 uint64_t  MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -172,6 +175,16 @@ void MemoryManager::Init(void)
     }
   }
 
+  str= getenv("GRID_ALLOC_NCACHE_HUGE");
+  if ( str ) {
+    Nc = atoi(str);
+    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
+      Ncache[CpuHuge]=Nc;
+      Ncache[AccHuge]=Nc;
+      Ncache[SharedHuge]=Nc;
+    }
+  }
+
   str= getenv("GRID_ALLOC_NCACHE_SMALL");
   if ( str ) {
     Nc = atoi(str);
@@ -192,9 +205,9 @@ void MemoryManager::InitMessage(void) { 
 
   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
 #endif
 
@@ -224,8 +237,11 @@ void MemoryManager::InitMessage(void) { 
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-  int cache = type+small;
+  int cache;
+  if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+  else cache = type;
+
   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
 #else
   return ptr;
@@ -236,11 +252,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
-  assert(ncache>0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif
 
+  if (ncache == 0) return ptr;
+
   void * ret = NULL;
   int v = -1;
 
@@ -275,8 +292,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-  int cache = type+small;
+  int cache;
+  if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+  else cache = type;
+
   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
   return NULL;
@@ -285,7 +305,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
-  assert(ncache>0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 74390bc5..7a5f978c 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 
 #define GRID_ALLOC_SMALL_LIMIT (4096)
+#define GRID_ALLOC_HUGE_LIMIT  (2147483648)
 
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
@@ -83,7 +84,7 @@ private:
   } AllocationCacheEntry;
 
   static const int NallocCacheMax=128;
-  static const int NallocType=6;
+  static const int NallocType=9;
   static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
   static int Victim[NallocType];
   static int Ncache[NallocType];
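
A worked note on the new dispatch (a sketch restating the code above, not additional patch content):

    // type is one of Cpu (0), Acc (3), Shared (6); for each type:
    //   bytes <  GRID_ALLOC_SMALL_LIMIT (4096)       -> cache = type + 2  (small pool)
    //   bytes >= GRID_ALLOC_HUGE_LIMIT  (2147483648) -> cache = type + 1  (huge pool)
    //   otherwise                                    -> cache = type      (standard pool)
    // Huge pools default to Ncache==0, i.e. freed >=2GB blocks are released
    // immediately; setting e.g. GRID_ALLOC_NCACHE_HUGE=4 in the environment
    // would retain up to 4 such blocks per pool for reuse.
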
From a3e935c9028a77938603168aa0edd6b24f05d607 Mon Sep 17 00:00:00 2001
From: Raoul Hodgson
Date: Mon, 27 Feb 2023 11:38:16 +0000
Subject: [PATCH 07/10] Batched block project/promote size checks

---
 Grid/lattice/Lattice_transfer.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 556785c0..4d1292a4 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -294,6 +294,8 @@ inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &co
                               const VLattice &Basis)
 {
   int NBatch = fineData.size();
+  assert(coarseData.size() == NBatch);
+
   GridBase * fine  = fineData[0].Grid();
   GridBase * coarse= coarseData[0].Grid();
 
@@ -622,7 +624,9 @@ inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis
                               std::vector<Lattice<vobj>> &fineData,
                               const VLattice &Basis)
 {
-  int NBatch = fineData.size();
+  int NBatch = coarseData.size();
+  assert(fineData.size() == NBatch);
+
   GridBase * fine   = fineData[0].Grid();
   GridBase * coarse = coarseData[0].Grid();
   for (int k=0; k<NBatch; k++)

Date: Tue, 21 Mar 2023 08:57:29 -0400
Subject: [PATCH 08/10] WriteDiscard

---
 Grid/lattice/Lattice_base.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 34f13fa6..d6289de2 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -129,7 +129,7 @@ public:
 
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWrite);
+    auto me  = View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       auto tmp = eval(ss,exprCopy);
       coalescedWrite(me[ss],tmp);
@@ -152,7 +152,7 @@ public:
 
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWrite);
+    auto me  = View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
@@ -174,7 +174,7 @@ public:
     this->checkerboard=cb;
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWrite);
+    auto me  = View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       auto tmp = eval(ss,exprCopy);
       coalescedWrite(me[ss],tmp);
@@ -288,8 +288,8 @@ public:
     typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
     conformable(*this,r);
     this->checkerboard = r.Checkerboard();
-    auto me =   View(AcceleratorWrite);
     auto him= r.View(AcceleratorRead);
+    auto me =   View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       coalescedWrite(me[ss],him(ss));
     });
@@ -303,8 +303,8 @@ public:
   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
     this->checkerboard = r.Checkerboard();
     conformable(*this,r);
-    auto me =   View(AcceleratorWrite);
     auto him= r.View(AcceleratorRead);
+    auto me =   View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       coalescedWrite(me[ss],him(ss));
     });
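
Why discard semantics matter here (a hedged summary of Grid's view-mode intent, not text from the patch): an AcceleratorWrite view may first make the old field contents device-resident, whereas AcceleratorWriteDiscard marks them dead, so a full overwrite needs no transfer beforehand.

    // Expression assignment overwrites every site of 'me', so the stale
    // contents never need to be copied to the accelerator first:
    auto me = View(AcceleratorWriteDiscard);  // open view, skip any HtoD fetch
    // AcceleratorWrite remains the mode for partial, read-modify-write updates.
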
From 281488611a7b127a52e7d91930774bb7e7faa81f Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Thu, 23 Mar 2023 10:28:50 -0400
Subject: [PATCH 09/10] WriteDiscard on construct

---
 Grid/lattice/Lattice_base.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index d6289de2..838cdda5 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -245,7 +245,7 @@ public:
   ///////////////////////////////////////////
   // user defined constructor
   ///////////////////////////////////////////
-  Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) { 
+  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { 
     this->_grid = grid;
     resize(this->_grid->oSites());
     assert((((uint64_t)&this->_odata[0])&0xF) ==0);
From 481bbaf1fce5b7ef0162c6f9ecec73a80e263cc7 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Thu, 23 Mar 2023 12:55:31 -0400
Subject: [PATCH 10/10] Interface to query memory use

---
 Grid/allocator/MemoryManager.cc |  1 +
 Grid/allocator/MemoryManager.h  | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index e9097c75..a9e5c9b4 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -39,6 +39,7 @@ void MemoryManager::PrintBytes(void)
 }
 
 uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
+uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 7a5f978c..0dc78f04 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -71,6 +71,21 @@ enum ViewMode {
   CpuWriteDiscard = 0x10 // same for now
 };
 
+struct MemoryStatus {
+  uint64_t     DeviceBytes;
+  uint64_t     DeviceLRUBytes;
+  uint64_t     DeviceMaxBytes;
+  uint64_t     HostToDeviceBytes;
+  uint64_t     DeviceToHostBytes;
+  uint64_t     HostToDeviceXfer;
+  uint64_t     DeviceToHostXfer;
+  uint64_t     DeviceEvictions;
+  uint64_t     DeviceDestroy;
+  uint64_t     DeviceAllocCacheBytes;
+  uint64_t     HostAllocCacheBytes;
+};
+
+
 class MemoryManager {
 private:
 
@@ -124,7 +139,24 @@ private:
   static uint64_t     DeviceEvictions;
   static uint64_t     DeviceDestroy;
 
   static uint64_t DeviceCacheBytes();
+  static uint64_t HostCacheBytes();
 
+  static MemoryStatus GetFootprint(void) {
+    MemoryStatus stat;
+    stat.DeviceBytes           = DeviceBytes;
+    stat.DeviceLRUBytes        = DeviceLRUBytes;
+    stat.DeviceMaxBytes        = DeviceMaxBytes;
+    stat.HostToDeviceBytes     = HostToDeviceBytes;
+    stat.DeviceToHostBytes     = DeviceToHostBytes;
+    stat.HostToDeviceXfer      = HostToDeviceXfer;
+    stat.DeviceToHostXfer      = DeviceToHostXfer;
+    stat.DeviceEvictions       = DeviceEvictions;
+    stat.DeviceDestroy         = DeviceDestroy;
+    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
+    stat.HostAllocCacheBytes   = HostCacheBytes();
+    return stat;
+  };
+
 private:
 #ifndef GRID_UVM
 //////////////////////////////////////////////////////////////////////
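
A sketch of querying the new interface from application code (the printed labels are invented for illustration):

    MemoryStatus st = MemoryManager::GetFootprint();
    std::cout << GridLogMessage << "Device bytes resident " << st.DeviceBytes           << std::endl;
    std::cout << GridLogMessage << "Device cache (freed)  " << st.DeviceAllocCacheBytes << std::endl;
    std::cout << GridLogMessage << "Host cache (freed)    " << st.HostAllocCacheBytes   << std::endl;
    std::cout << GridLogMessage << "HtoD transfers        " << st.HostToDeviceXfer      << std::endl;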