From 88015b08588e040abcb888041bd3f63187307834 Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Mon, 26 Dec 2022 10:01:32 +0100
Subject: [PATCH 01/15] Split sum into rankSum and GlobalSum

---
 Grid/lattice/Lattice_reduction.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 326b9ea3..d9025de0 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -144,17 +144,23 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 }
 
 template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
   autoView( arg_v, arg, AcceleratorRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu(&arg_v[0],osites);
+  return sum_gpu(&arg_v[0],osites);
 #else
   autoView(arg_v, arg, CpuRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  auto ssum = rankSum(arg);
   arg.Grid()->GlobalSum(ssum);
   return ssum;
 }

From 7d62f1d6d20790f681f37fe1755e3712d2a4e2b0 Mon Sep 17 00:00:00 2001
From: Makis Kappas
Date: Wed, 11 Jan 2023 21:26:25 +0000
Subject: [PATCH 02/15] Populate the Cshift_table on the GPU

The Cshift table is allocated in unified memory and used in the
LambdaApply kernels, but it was populated from the host. This creates
many unified-memory HtoD and DtoH operations and has a negative effect
on performance. With this commit we populate the Cshift table on the
device with the populate_Cshift_table() kernel.
---
 Grid/cshift/Cshift_common.h | 40 +++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
index cf902b58..742c99da 100644
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   }
 }
 
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+
+template <typename T>
+T iDivUp(T a, T b)     // Round a / b to nearest higher integer value
+{ return (a % b != 0) ?
(a / b + 1) : (a / b); } + +template +__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) +{ + int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx >= e1*e2) return; + + int n, b, o; + + n = idx / e2; + b = idx % e2; + o = n*stride + b; + + vector[2*idx + 0] = lo + o; + vector[2*idx + 1] = ro + o; +} + +#endif + ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// @@ -321,12 +345,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int ent=0; if(cbmask == 0x3 ){ +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + ent = e1*e2; + dim3 blockSize(acceleratorThreads()); + dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); + populate_Cshift_table<<>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o,ro+o); } } +#endif } else { for(int n=0;n void Copy_plane_permute(Lattice& lhs,const Lattice>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o+b,ro+o+b); }} +#endif } else { for(int n=0;n Date: Tue, 14 Feb 2023 14:37:10 +0000 Subject: [PATCH 03/15] Add batched block project/promote functions --- Grid/lattice/Lattice_transfer.h | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index ef489ea6..556785c0 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -288,7 +288,34 @@ inline void blockProject(Lattice > &coarseData, blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); } } +template +inline void batchBlockProject(std::vector>> &coarseData, + const std::vector> &fineData, + const VLattice &Basis) +{ + int NBatch = fineData.size(); + GridBase * fine = fineData[0].Grid(); + GridBase * coarse= coarseData[0].Grid(); + Lattice> ip(coarse); + std::vector> fineDataCopy = fineData; + + autoView(ip_, ip, AcceleratorWrite); + for(int v=0;v + accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { + convertType(coarseData_[sc](v),ip_[sc]); + }); + + // improve numerical stability of projection + // |fine> = |fine> - |basis> + ip=-ip; + blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); + } + } +} template inline void blockZAXPY(Lattice &fineZ, @@ -590,6 +617,24 @@ inline void blockPromote(const Lattice > &coarseData, } #endif +template +inline void batchBlockPromote(const std::vector>> &coarseData, + std::vector> &fineData, + const VLattice &Basis) +{ + int NBatch = fineData.size(); + GridBase * fine = fineData[0].Grid(); + GridBase * coarse = coarseData[0].Grid(); + for (int k=0; k> ip = PeekIndex<0>(coarseData[k],i); + blockZAXPY(fineData[k],ip,Basis[i],fineData[k]); + } + } +} + // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. 
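 //
 // Editorial sketch, not part of the patch: intended usage of the batched
 // interfaces added above, assuming a coarse/fine grid pair and a basis
 // prepared exactly as for the existing blockProject/blockPromote:
 //
 //   std::vector<Lattice<iVector<CComplex,nbasis>>> coarseData(NBatch,coarse);
 //   std::vector<Lattice<vobj>>                     fineData(NBatch,fine);
 //   batchBlockProject(coarseData,fineData,Basis); // all NBatch fields in one call
 //   batchBlockPromote(coarseData,fineData,Basis); // batched fine-grid reconstruction
 //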
// Simd layouts need not match since we use peek/poke Local template From 920a51438db5a5aaaa2f93b7308b567573cb52dc Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Tue, 14 Feb 2023 17:04:13 +0000 Subject: [PATCH 04/15] Added batched Mixed precision CG --- Grid/algorithms/Algorithms.h | 1 + .../ConjugateGradientMixedPrecBatched.h | 213 ++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 7f27784b..ff3da17d 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB); #include #include #include +#include #include #include #include diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h new file mode 100644 index 00000000..93f5c677 --- /dev/null +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h @@ -0,0 +1,213 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h + + Copyright (C) 2015 + + Author: Raoul Hodgson + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H +#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H + +NAMESPACE_BEGIN(Grid); + +//Mixed precision restarted defect correction CG +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class MixedPrecisionConjugateGradientBatched : public LinearFunction { +public: + using LinearFunction::operator(); + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + Integer MaxPatchupIterations; + GridBase* SinglePrecGrid; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + LinearOperatorBase &Linop_f; + LinearOperatorBase &Linop_d; + + //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess + LinearFunction *guesser; + bool updateResidual; + + MixedPrecisionConjugateGradientBatched(RealD tol, + Integer maxinnerit, + Integer maxouterit, + Integer maxpatchit, + GridBase* _sp_grid, + LinearOperatorBase &_Linop_f, + LinearOperatorBase &_Linop_d, + bool _updateResidual=true) : + Linop_f(_Linop_f), Linop_d(_Linop_d), + Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), + OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; + + void useGuesser(LinearFunction &g){ + guesser = &g; + } + + void operator() (const FieldD &src_d_in, FieldD &sol_d){ + std::vector srcs_d_in{src_d_in}; + std::vector sols_d{sol_d}; + + (*this)(srcs_d_in,sols_d); + + sol_d = sols_d[0]; + } + + void operator() (const std::vector &src_d_in, std::vector &sol_d){ + assert(src_d_in.size() == sol_d.size()); + int NBatch = src_d_in.size(); + + std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; + + Integer TotalOuterIterations = 0; //Number of restarts + std::vector TotalInnerIterations(NBatch,0); //Number of inner CG iterations + std::vector TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step + + GridStopWatch TotalTimer; + TotalTimer.Start(); + + GridStopWatch InnerCGtimer; + GridStopWatch PrecChangeTimer; + + int cb = src_d_in[0].Checkerboard(); + + std::vector src_norm; + std::vector norm; + std::vector stop; + + GridBase* DoublePrecGrid = src_d_in[0].Grid(); + FieldD tmp_d(DoublePrecGrid); + tmp_d.Checkerboard() = cb; + + FieldD tmp2_d(DoublePrecGrid); + tmp2_d.Checkerboard() = cb; + + std::vector src_d; + std::vector src_f; + std::vector sol_f; + + for (int i=0; i CG_f(inner_tol, MaxInnerIterations); + CG_f.ErrorOnNoConverge = false; + + Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count + + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; + + bool allConverged = true; + + for (int i=0; i OuterLoopNormMult * stop[i]) { + allConverged = false; + } + } + if (allConverged) break; + + if (updateResidual) { + RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); + RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); + while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? 
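        // Editorial note, not part of the patch: the while loop above doubles
        // inner_tol until normMax * inner_tol^2 >= stopMax, i.e. it settles
        // near inner_tol ~ sqrt(stopMax/normMax), so the single-precision
        // solve is only asked for the accuracy that the current outer
        // residual actually requires.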
+ CG_f.Tolerance = inner_tol; + } + + //Optionally improve inner solver guess (eg using known eigenvectors) + if(guesser != NULL) { + (*guesser)(src_f, sol_f); + } + + for (int i=0; i CG_d(Tolerance, MaxPatchupIterations); + CG_d(Linop_d, src_d_in[i], sol_d[i]); + TotalFinalStepIterations[i] += CG_d.IterationsToComplete; + } + + TotalTimer.Stop(); + + std::cout << GridLogMessage << std::endl; + for (int i=0; i Date: Sun, 26 Feb 2023 12:22:45 +0000 Subject: [PATCH 05/15] Expose cached bytes --- Grid/allocator/MemoryManager.cc | 6 +++++- Grid/allocator/MemoryManager.h | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index d055898f..955a1f90 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -35,6 +35,8 @@ void MemoryManager::PrintBytes(void) } +uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccSmall]; } + ////////////////////////////////////////////////////////////////////// // Data tables for recently freed pooiniter caches ////////////////////////////////////////////////////////////////////// @@ -190,7 +192,9 @@ void MemoryManager::InitMessage(void) { std::cout << GridLogMessage<< "MemoryManager::Init() setting up"< Date: Sun, 26 Feb 2023 14:15:28 +0000 Subject: [PATCH 06/15] Add huge cache type and allow Ncache==0 --- Grid/allocator/MemoryManager.cc | 51 ++++++++++++++++++++++----------- Grid/allocator/MemoryManager.h | 3 +- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index 955a1f90..e9097c75 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid); /*Allocation types, saying which pointer cache should be used*/ #define Cpu (0) -#define CpuSmall (1) -#define Acc (2) -#define AccSmall (3) -#define Shared (4) -#define SharedSmall (5) +#define CpuHuge (1) +#define CpuSmall (2) +#define Acc (3) +#define AccHuge (4) +#define AccSmall (5) +#define Shared (6) +#define SharedHuge (7) +#define SharedSmall (8) #undef GRID_MM_VERBOSE uint64_t total_shared; uint64_t total_device; @@ -35,14 +38,14 @@ void MemoryManager::PrintBytes(void) } -uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccSmall]; } +uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } ////////////////////////////////////////////////////////////////////// // Data tables for recently freed pooiniter caches ////////////////////////////////////////////////////////////////////// MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; int MemoryManager::Victim[MemoryManager::NallocType]; -int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 }; +int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 }; uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; ////////////////////////////////////////////////////////////////////// // Actual allocation and deallocation utils @@ -172,6 +175,16 @@ void MemoryManager::Init(void) } } + str= getenv("GRID_ALLOC_NCACHE_HUGE"); + if ( str ) { + Nc = atoi(str); + if ( (Nc>=0) && (Nc < NallocCacheMax)) { + Ncache[CpuHuge]=Nc; + Ncache[AccHuge]=Nc; + Ncache[SharedHuge]=Nc; + } + } + str= getenv("GRID_ALLOC_NCACHE_SMALL"); if ( str ) { Nc = atoi(str); @@ -192,9 +205,9 @@ void 
MemoryManager::InitMessage(void) { std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]); #else return ptr; @@ -236,11 +252,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif + if (ncache == 0) return ptr; + void * ret = NULL; int v = -1; @@ -275,8 +292,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries void *MemoryManager::Lookup(size_t bytes,int type) { #ifdef ALLOCATION_CACHE - bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); - int cache = type+small; + int cache; + if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; + else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); #else return NULL; @@ -285,7 +305,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 74390bc5..7a5f978c 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? #define GRID_ALLOC_SMALL_LIMIT (4096) +#define GRID_ALLOC_HUGE_LIMIT (2147483648) #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) @@ -83,7 +84,7 @@ private: } AllocationCacheEntry; static const int NallocCacheMax=128; - static const int NallocType=6; + static const int NallocType=9; static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; static int Victim[NallocType]; static int Ncache[NallocType]; From a3e935c9028a77938603168aa0edd6b24f05d607 Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Mon, 27 Feb 2023 11:38:16 +0000 Subject: [PATCH 07/15] Batched block project/promote size checks --- Grid/lattice/Lattice_transfer.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 556785c0..4d1292a4 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -294,6 +294,8 @@ inline void batchBlockProject(std::vector>> &co const VLattice &Basis) { int NBatch = fineData.size(); + assert(coarseData.size() == NBatch); + GridBase * fine = fineData[0].Grid(); GridBase * coarse= coarseData[0].Grid(); @@ -622,7 +624,9 @@ inline void batchBlockPromote(const std::vector std::vector> &fineData, const VLattice &Basis) { - int NBatch = fineData.size(); + int NBatch = coarseData.size(); + assert(fineData.size() == NBatch); + GridBase * fine = fineData[0].Grid(); GridBase * coarse = coarseData[0].Grid(); for (int k=0; k Date: Thu, 23 Mar 2023 10:28:50 -0400 Subject: [PATCH 08/15] WriteDiscard on construct --- Grid/lattice/Lattice_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index d6289de2..838cdda5 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -245,7 +245,7 @@ public: /////////////////////////////////////////// // user defined constructor 
   ///////////////////////////////////////////
-  Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) {
+  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
     this->_grid = grid;
     resize(this->_grid->oSites());
     assert((((uint64_t)&this->_odata[0])&0xF) ==0);

From 481bbaf1fce5b7ef0162c6f9ecec73a80e263cc7 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Thu, 23 Mar 2023 12:55:31 -0400
Subject: [PATCH 09/15] Interface to query memory use

---
 Grid/allocator/MemoryManager.cc |  1 +
 Grid/allocator/MemoryManager.h  | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index e9097c75..a9e5c9b4 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -39,6 +39,7 @@ void MemoryManager::PrintBytes(void)
 }
 
 uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
+uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 7a5f978c..0dc78f04 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -71,6 +71,21 @@ enum ViewMode {
   CpuWriteDiscard = 0x10 // same for now
 };
 
+struct MemoryStatus {
+  uint64_t DeviceBytes;
+  uint64_t DeviceLRUBytes;
+  uint64_t DeviceMaxBytes;
+  uint64_t HostToDeviceBytes;
+  uint64_t DeviceToHostBytes;
+  uint64_t HostToDeviceXfer;
+  uint64_t DeviceToHostXfer;
+  uint64_t DeviceEvictions;
+  uint64_t DeviceDestroy;
+  uint64_t DeviceAllocCacheBytes;
+  uint64_t HostAllocCacheBytes;
+};
+
+
 class MemoryManager {
 private:
@@ -124,7 +139,24 @@ private:
   static uint64_t DeviceDestroy;
 
   static uint64_t DeviceCacheBytes();
+  static uint64_t HostCacheBytes();
 
+  static MemoryStatus GetFootprint(void) {
+    MemoryStatus stat;
+    stat.DeviceBytes           = DeviceBytes;
+    stat.DeviceLRUBytes        = DeviceLRUBytes;
+    stat.DeviceMaxBytes        = DeviceMaxBytes;
+    stat.HostToDeviceBytes     = HostToDeviceBytes;
+    stat.DeviceToHostBytes     = DeviceToHostBytes;
+    stat.HostToDeviceXfer      = HostToDeviceXfer;
+    stat.DeviceToHostXfer      = DeviceToHostXfer;
+    stat.DeviceEvictions       = DeviceEvictions;
+    stat.DeviceDestroy         = DeviceDestroy;
+    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
+    stat.HostAllocCacheBytes   = HostCacheBytes();
+    return stat;
+  };
+
 private:
 #ifndef GRID_UVM
   //////////////////////////////////////////////////////////////////////

From 2fbcf13c46b1e7d0d0c6c7f9ead0f58d12c78cb5 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Mon, 27 Mar 2023 14:25:14 -0700
Subject: [PATCH 10/15] SYCL fix

---
 Grid/communicator/SharedMemoryMPI.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 795f3928..792f8405 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -36,9 +36,11 @@ Author: Christoph Lehner
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
-#ifdef GRID_SYCl
+#ifdef GRID_SYCL
 #endif
 
+#define GRID_SYCL_LEVEL_ZERO_IPC
+
 NAMESPACE_BEGIN(Grid);
 
 #define header "SharedMemoryMpi: "

From dd3bbb8fa2ff3878045f3b7e01e68036baac9bcb Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Mon, 27 Mar 2023 17:27:45 -0700
Subject: [PATCH 11/15] Move the synchronise out to the stencil so there is
 one call instead of one call per packet

---
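Editorial note, not part of the commit: patch 09 above exposes the allocator
state in a single call. A minimal usage sketch, assuming only Grid's standard
logging stream:

  MemoryStatus st = MemoryManager::GetFootprint();
  std::cout << GridLogMessage << "device bytes " << st.DeviceBytes
            << " device alloc cache " << st.DeviceAllocCacheBytes
            << " host alloc cache " << st.HostAllocCacheBytes << std::endl;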
Grid/communicator/Communicator_mpi3.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index b8ce7bca..280f5c4e 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -401,8 +401,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { // std::cout << "Copy Synchronised\n"< Date: Mon, 27 Mar 2023 17:28:38 -0700 Subject: [PATCH 12/15] Move the copy synch out to stencil and do one per call instead of one per packet --- Grid/stencil/Stencil.h | 48 +++++++++++------------------------------- 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 65d878cb..6296df4e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -398,6 +398,8 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + // Buffers are gathered AND synchronised + // Copies are MPI ISend OR asynch copy on copy stream reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilBarrier();// Synch shared memory on a single nodes } void CommunicateComplete(std::vector > &reqs) { + // complete intranode + acceleratorCopySynchronise(); + // complete MPI for(int i=0;iStencilSendToRecvFromComplete(reqs[i],i); } + // Everyone agrees we are all done + _grid->StencilBarrier(); commtime+=usecond(); } //////////////////////////////////////////////////////////////////////// @@ -425,33 +431,9 @@ public: //////////////////////////////////////////////////////////////////////// void Communicate(void) { - if ( 0 ){ - thread_region { - // must be called in parallel region - int mythread = thread_num(); - int maxthreads= thread_max(); - int nthreads = CartesianCommunicator::nCommThreads; - assert(nthreads <= maxthreads); - if (nthreads == -1) nthreads = 1; - if (mythread < nthreads) { - for (int i = mythread; i < Packets.size(); i += nthreads) { - double start = usecond(); - uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - comm_bytes_thr[mythread] += bytes; - shm_bytes_thr[mythread] += Packets[i].bytes - bytes; - comm_time_thr[mythread] += usecond() - start; - } - } - } - } else { // Concurrent and non-threaded asynch calls to MPI - std::vector > reqs; - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - } + std::vector > reqs; + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); } template void HaloExchange(const Lattice &source,compressor &compress) @@ -527,7 +509,6 @@ public: _grid->StencilBarrier();// Synch shared memory on a single nodes mpi3synctime_g+=usecond(); - // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); halogtime-=usecond(); @@ -586,13 +567,8 @@ public: CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - mpi3synctime-=usecond(); - accelerator_barrier(); - _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime+=usecond(); - shmmergetime-=usecond(); - CommsMerge(decompress,MergersSHM,DecompressionsSHM); - shmmergetime+=usecond(); + assert(MergersSHM.size()==0); + assert(DecompressionsSHM.size()==0); } template From 8feedb4f6f74498362785cc6cae8d43ad7b5988f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 17:29:21 -0700 Subject: [PATCH 13/15] Include files moved --- 
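Editorial note, not part of the commit: with patch 12 above, a halo exchange
now pairs one gather/ISend stage with a single synchronisation instead of
synchronising per packet. Schematically, following the new Communicate():

  std::vector<std::vector<CommsRequest_t> > reqs;
  this->CommunicateBegin(reqs);    // gather buffers; issue all MPI ISends / asynch copies
  this->CommunicateComplete(reqs); // one acceleratorCopySynchronise(), MPI completion, barrier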
Grid/threads/Accelerator.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index e17e85d1..db998739 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -249,14 +249,16 @@ inline int acceleratorIsCommunicable(void *ptr) ////////////////////////////////////////////// #ifdef GRID_SYCL NAMESPACE_END(Grid); +#if 0 #include #include - -#define GRID_SYCL_LEVEL_ZERO_IPC - -#ifdef GRID_SYCL_LEVEL_ZERO_IPC #include #include +#else +#include +#include +#include +#include #endif NAMESPACE_BEGIN(Grid); From 0efa107cb6ccca45ea27d74ab2b928d206a9811a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 17:29:43 -0700 Subject: [PATCH 14/15] Script update --- systems/PVC/benchmarks/run-1tile.sh | 2 +- systems/PVC/benchmarks/run-2tile-mpi.sh | 14 ++++++-------- systems/PVC/benchmarks/wrap.sh | 10 +++++----- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh index 0fe80247..9a29b773 100755 --- a/systems/PVC/benchmarks/run-1tile.sh +++ b/systems/PVC/benchmarks/run-1tile.sh @@ -4,7 +4,7 @@ #SBATCH -p QZ1J-ICX-PVC ##SBATCH -p QZ1J-SPR-PVC-2C -source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh +#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh export NT=8 diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh index cefab776..5a6a9b8f 100755 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ b/systems/PVC/benchmarks/run-2tile-mpi.sh @@ -4,7 +4,7 @@ #SBATCH -p QZ1J-ICX-PVC -source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh +#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh export NT=16 @@ -19,16 +19,14 @@ export SYCL_DEVICE_FILTER=gpu,level_zero export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 -export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 -for i in 0 +for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 do -mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768 -mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768 +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i done -#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log -#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh index bb7b517d..06ed0ca1 100755 --- a/systems/PVC/benchmarks/wrap.sh +++ b/systems/PVC/benchmarks/wrap.sh 
@@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK -if [ $MPI_LOCALRANKID = "0" ] -then +#if [ $MPI_LOCALRANKID = "0" ] +#then # ~psteinbr/build_pti/ze_tracer -h $@ - onetrace --chrome-device-timeline $@ -else +# onetrace --chrome-device-timeline $@ +#else $@ -fi +#fi From 900e01f49bca04257ee979e1c7e97c2dc1e3cd9e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 21:35:06 -0700 Subject: [PATCH 15/15] Temporary --- benchmarks/Benchmark_dwf_fp32_paranoid.cc | 387 ++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 benchmarks/Benchmark_dwf_fp32_paranoid.cc diff --git a/benchmarks/Benchmark_dwf_fp32_paranoid.cc b/benchmarks/Benchmark_dwf_fp32_paranoid.cc new file mode 100644 index 00000000..20f23b60 --- /dev/null +++ b/benchmarks/Benchmark_dwf_fp32_paranoid.cc @@ -0,0 +1,387 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + Coordinate latt4 = GridDefaultLatt(); + int Ls=16; + for(int i=0;i> Ls; + } + + GridLogLayout(); + + long unsigned int single_site_flops = 8*Nc*(7+16*Nc); + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); + LatticeFermionF src1 (FGrid); random(RNG5,src1); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; +#if 0 + Umu=1.0; + for(int mu=0;mu(Umu,mu); + // if (mu !=2 ) ttmp = 0; + // ttmp = ttmp* pow(10.0,mu); + PokeIndex(Umu,ttmp,mu); + } + std::cout << GridLogMessage << "Forced to diagonal " << std::endl; +#endif + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + // replicate across fifth dimension + // LatticeGaugeFieldF Umu5d(FGrid); + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + + /* + std::cout << "RESULT\n " << result<Barrier(); + exit(-1); + } + assert (norm2(err)< 1.0e-4 ); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s1.0e-4)){ +/* + std::cout<< "DAG RESULT\n " <Barrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* + std::cout<< "Deo RESULT\n " <