From 88015b08588e040abcb888041bd3f63187307834 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 26 Dec 2022 10:01:32 +0100 Subject: [PATCH 01/21] Split sum in rankSum and GlobalSum --- Grid/lattice/Lattice_reduction.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 326b9ea3..d9025de0 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -144,17 +144,23 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) } template -inline typename vobj::scalar_object sum(const Lattice &arg) +inline typename vobj::scalar_object rankSum(const Lattice &arg) { #if defined(GRID_CUDA)||defined(GRID_HIP) autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum_gpu(&arg_v[0],osites); + return sum_gpu(&arg_v[0],osites); #else autoView(arg_v, arg, CpuRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum_cpu(&arg_v[0],osites); + return sum_cpu(&arg_v[0],osites); #endif +} + +template +inline typename vobj::scalar_object sum(const Lattice &arg) +{ + auto ssum = rankSum(arg); arg.Grid()->GlobalSum(ssum); return ssum; } From 7d62f1d6d20790f681f37fe1755e3712d2a4e2b0 Mon Sep 17 00:00:00 2001 From: Makis Kappas Date: Wed, 11 Jan 2023 21:26:25 +0000 Subject: [PATCH 02/21] Populate the Cshift_table in the GPU Cshift is allocated in Unified memory and used in the LambdaApply kernels but also populated from the host. This creates a lot of Unified HtoD and DtoH mem operations and has a negative effect in performance. With this commit we populate the Cshift table in the device with the populate_Cshift_table() kernel. --- Grid/cshift/Cshift_common.h | 40 +++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index cf902b58..742c99da 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -297,6 +297,30 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA } } +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + +template +T iDivUp(T a, T b) // Round a / b to nearest higher integer value +{ return (a % b != 0) ? (a / b + 1) : (a / b); } + +template +__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) +{ + int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx >= e1*e2) return; + + int n, b, o; + + n = idx / e2; + b = idx % e2; + o = n*stride + b; + + vector[2*idx + 0] = lo + o; + vector[2*idx + 1] = ro + o; +} + +#endif + ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// @@ -321,12 +345,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int ent=0; if(cbmask == 0x3 ){ +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + ent = e1*e2; + dim3 blockSize(acceleratorThreads()); + dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); + populate_Cshift_table<<>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o,ro+o); } } +#endif } else { for(int n=0;n void Copy_plane_permute(Lattice& lhs,const Lattice>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o+b,ro+o+b); }} +#endif } else { for(int n=0;n Date: Tue, 14 Feb 2023 14:37:10 +0000 Subject: [PATCH 03/21] Add batched block project/promote functions --- Grid/lattice/Lattice_transfer.h | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index ef489ea6..556785c0 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -288,7 +288,34 @@ inline void blockProject(Lattice > &coarseData, blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); } } +template +inline void batchBlockProject(std::vector>> &coarseData, + const std::vector> &fineData, + const VLattice &Basis) +{ + int NBatch = fineData.size(); + GridBase * fine = fineData[0].Grid(); + GridBase * coarse= coarseData[0].Grid(); + Lattice> ip(coarse); + std::vector> fineDataCopy = fineData; + + autoView(ip_, ip, AcceleratorWrite); + for(int v=0;v + accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { + convertType(coarseData_[sc](v),ip_[sc]); + }); + + // improve numerical stability of projection + // |fine> = |fine> - |basis> + ip=-ip; + blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); + } + } +} template inline void blockZAXPY(Lattice &fineZ, @@ -590,6 +617,24 @@ inline void blockPromote(const Lattice > &coarseData, } #endif +template +inline void batchBlockPromote(const std::vector>> &coarseData, + std::vector> &fineData, + const VLattice &Basis) +{ + int NBatch = fineData.size(); + GridBase * fine = fineData[0].Grid(); + GridBase * coarse = coarseData[0].Grid(); + for (int k=0; k> ip = PeekIndex<0>(coarseData[k],i); + blockZAXPY(fineData[k],ip,Basis[i],fineData[k]); + } + } +} + // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Simd layouts need not match since we use peek/poke Local template From 920a51438db5a5aaaa2f93b7308b567573cb52dc Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Tue, 14 Feb 2023 17:04:13 +0000 Subject: [PATCH 04/21] Added batched Mixed precision CG --- Grid/algorithms/Algorithms.h | 1 + .../ConjugateGradientMixedPrecBatched.h | 213 ++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 7f27784b..ff3da17d 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB); #include #include #include +#include #include #include #include diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h new file mode 100644 index 00000000..93f5c677 --- /dev/null +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h @@ -0,0 +1,213 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h + + Copyright (C) 2015 + + Author: Raoul Hodgson + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H +#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H + +NAMESPACE_BEGIN(Grid); + +//Mixed precision restarted defect correction CG +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class MixedPrecisionConjugateGradientBatched : public LinearFunction { +public: + using LinearFunction::operator(); + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + Integer MaxPatchupIterations; + GridBase* SinglePrecGrid; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + LinearOperatorBase &Linop_f; + LinearOperatorBase &Linop_d; + + //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess + LinearFunction *guesser; + bool updateResidual; + + MixedPrecisionConjugateGradientBatched(RealD tol, + Integer maxinnerit, + Integer maxouterit, + Integer maxpatchit, + GridBase* _sp_grid, + LinearOperatorBase &_Linop_f, + LinearOperatorBase &_Linop_d, + bool _updateResidual=true) : + Linop_f(_Linop_f), Linop_d(_Linop_d), + Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), + OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; + + void useGuesser(LinearFunction &g){ + guesser = &g; + } + + void operator() (const FieldD &src_d_in, FieldD &sol_d){ + std::vector srcs_d_in{src_d_in}; + std::vector sols_d{sol_d}; + + (*this)(srcs_d_in,sols_d); + + sol_d = sols_d[0]; + } + + void operator() (const std::vector &src_d_in, std::vector &sol_d){ + assert(src_d_in.size() == sol_d.size()); + int NBatch = src_d_in.size(); + + std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; + + Integer TotalOuterIterations = 0; //Number of restarts + std::vector TotalInnerIterations(NBatch,0); //Number of inner CG iterations + std::vector TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step + + GridStopWatch TotalTimer; + TotalTimer.Start(); + + GridStopWatch InnerCGtimer; + GridStopWatch PrecChangeTimer; + + int cb = src_d_in[0].Checkerboard(); + + std::vector src_norm; + std::vector norm; + std::vector stop; + + GridBase* DoublePrecGrid = src_d_in[0].Grid(); + FieldD tmp_d(DoublePrecGrid); + tmp_d.Checkerboard() = cb; + + FieldD tmp2_d(DoublePrecGrid); + tmp2_d.Checkerboard() = cb; + + std::vector src_d; + std::vector src_f; + std::vector sol_f; + + for (int i=0; i CG_f(inner_tol, MaxInnerIterations); + CG_f.ErrorOnNoConverge = false; + + Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count + + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; + + bool allConverged = true; + + for (int i=0; i OuterLoopNormMult * stop[i]) { + allConverged = false; + } + } + if (allConverged) break; + + if (updateResidual) { + RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); + RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); + while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? + CG_f.Tolerance = inner_tol; + } + + //Optionally improve inner solver guess (eg using known eigenvectors) + if(guesser != NULL) { + (*guesser)(src_f, sol_f); + } + + for (int i=0; i CG_d(Tolerance, MaxPatchupIterations); + CG_d(Linop_d, src_d_in[i], sol_d[i]); + TotalFinalStepIterations[i] += CG_d.IterationsToComplete; + } + + TotalTimer.Stop(); + + std::cout << GridLogMessage << std::endl; + for (int i=0; i Date: Sun, 26 Feb 2023 12:22:45 +0000 Subject: [PATCH 05/21] Expose cached bytes --- Grid/allocator/MemoryManager.cc | 6 +++++- Grid/allocator/MemoryManager.h | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index d055898f..955a1f90 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -35,6 +35,8 @@ void MemoryManager::PrintBytes(void) } +uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccSmall]; } + ////////////////////////////////////////////////////////////////////// // Data tables for recently freed pooiniter caches ////////////////////////////////////////////////////////////////////// @@ -190,7 +192,9 @@ void MemoryManager::InitMessage(void) { std::cout << GridLogMessage<< "MemoryManager::Init() setting up"< Date: Sun, 26 Feb 2023 14:15:28 +0000 Subject: [PATCH 06/21] Add huge cache type and allow Ncache==0 --- Grid/allocator/MemoryManager.cc | 51 ++++++++++++++++++++++----------- Grid/allocator/MemoryManager.h | 3 +- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index 955a1f90..e9097c75 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid); /*Allocation types, saying which pointer cache should be used*/ #define Cpu (0) -#define CpuSmall (1) -#define Acc (2) -#define AccSmall (3) -#define Shared (4) -#define SharedSmall (5) +#define CpuHuge (1) +#define CpuSmall (2) +#define Acc (3) +#define AccHuge (4) +#define AccSmall (5) +#define Shared (6) +#define SharedHuge (7) +#define SharedSmall (8) #undef GRID_MM_VERBOSE uint64_t total_shared; uint64_t total_device; @@ -35,14 +38,14 @@ void MemoryManager::PrintBytes(void) } -uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccSmall]; } +uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } ////////////////////////////////////////////////////////////////////// // Data tables for recently freed pooiniter caches ////////////////////////////////////////////////////////////////////// MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; int MemoryManager::Victim[MemoryManager::NallocType]; -int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 }; +int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 }; uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; ////////////////////////////////////////////////////////////////////// // Actual allocation and deallocation utils @@ -172,6 +175,16 @@ void MemoryManager::Init(void) } } + str= getenv("GRID_ALLOC_NCACHE_HUGE"); + if ( str ) { + Nc = atoi(str); + if ( (Nc>=0) && (Nc < NallocCacheMax)) { + Ncache[CpuHuge]=Nc; + Ncache[AccHuge]=Nc; + Ncache[SharedHuge]=Nc; + } + } + str= getenv("GRID_ALLOC_NCACHE_SMALL"); if ( str ) { Nc = atoi(str); @@ -192,9 +205,9 @@ void MemoryManager::InitMessage(void) { std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]); #else return ptr; @@ -236,11 +252,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif + if (ncache == 0) return ptr; + void * ret = NULL; int v = -1; @@ -275,8 +292,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries void *MemoryManager::Lookup(size_t bytes,int type) { #ifdef ALLOCATION_CACHE - bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); - int cache = type+small; + int cache; + if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; + else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); #else return NULL; @@ -285,7 +305,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 74390bc5..7a5f978c 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? #define GRID_ALLOC_SMALL_LIMIT (4096) +#define GRID_ALLOC_HUGE_LIMIT (2147483648) #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) @@ -83,7 +84,7 @@ private: } AllocationCacheEntry; static const int NallocCacheMax=128; - static const int NallocType=6; + static const int NallocType=9; static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; static int Victim[NallocType]; static int Ncache[NallocType]; From a3e935c9028a77938603168aa0edd6b24f05d607 Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Mon, 27 Feb 2023 11:38:16 +0000 Subject: [PATCH 07/21] Batched block project/promote size checks --- Grid/lattice/Lattice_transfer.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 556785c0..4d1292a4 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -294,6 +294,8 @@ inline void batchBlockProject(std::vector>> &co const VLattice &Basis) { int NBatch = fineData.size(); + assert(coarseData.size() == NBatch); + GridBase * fine = fineData[0].Grid(); GridBase * coarse= coarseData[0].Grid(); @@ -622,7 +624,9 @@ inline void batchBlockPromote(const std::vector std::vector> &fineData, const VLattice &Basis) { - int NBatch = fineData.size(); + int NBatch = coarseData.size(); + assert(fineData.size() == NBatch); + GridBase * fine = fineData[0].Grid(); GridBase * coarse = coarseData[0].Grid(); for (int k=0; k Date: Thu, 23 Mar 2023 10:28:50 -0400 Subject: [PATCH 08/21] WriteDiscard on construct --- Grid/lattice/Lattice_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index d6289de2..838cdda5 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -245,7 +245,7 @@ public: /////////////////////////////////////////// // user defined constructor /////////////////////////////////////////// - Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) { + Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { this->_grid = grid; resize(this->_grid->oSites()); assert((((uint64_t)&this->_odata[0])&0xF) ==0); From 481bbaf1fce5b7ef0162c6f9ecec73a80e263cc7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 23 Mar 2023 12:55:31 -0400 Subject: [PATCH 09/21] Interface to query memory use --- Grid/allocator/MemoryManager.cc | 1 + Grid/allocator/MemoryManager.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index e9097c75..a9e5c9b4 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -39,6 +39,7 @@ void MemoryManager::PrintBytes(void) } uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } +uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; } ////////////////////////////////////////////////////////////////////// // Data tables for recently freed pooiniter caches diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 7a5f978c..0dc78f04 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -71,6 +71,21 @@ enum ViewMode { CpuWriteDiscard = 0x10 // same for now }; +struct MemoryStatus { + uint64_t DeviceBytes; + uint64_t DeviceLRUBytes; + uint64_t DeviceMaxBytes; + uint64_t HostToDeviceBytes; + uint64_t DeviceToHostBytes; + uint64_t HostToDeviceXfer; + uint64_t DeviceToHostXfer; + uint64_t DeviceEvictions; + uint64_t DeviceDestroy; + uint64_t DeviceAllocCacheBytes; + uint64_t HostAllocCacheBytes; +}; + + class MemoryManager { private: @@ -124,7 +139,24 @@ private: static uint64_t DeviceDestroy; static uint64_t DeviceCacheBytes(); + static uint64_t HostCacheBytes(); + static MemoryStatus GetFootprint(void) { + MemoryStatus stat; + stat.DeviceBytes = DeviceBytes; + stat.DeviceLRUBytes = DeviceLRUBytes; + stat.DeviceMaxBytes = DeviceMaxBytes; + stat.HostToDeviceBytes = HostToDeviceBytes; + stat.DeviceToHostBytes = DeviceToHostBytes; + stat.HostToDeviceXfer = HostToDeviceXfer; + stat.DeviceToHostXfer = DeviceToHostXfer; + stat.DeviceEvictions = DeviceEvictions; + stat.DeviceDestroy = DeviceDestroy; + stat.DeviceAllocCacheBytes = DeviceCacheBytes(); + stat.HostAllocCacheBytes = HostCacheBytes(); + return stat; + }; + private: #ifndef GRID_UVM ////////////////////////////////////////////////////////////////////// From d8a9a745d8117a924a4255422b8f984c14d511fd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 24 Mar 2023 15:40:30 -0400 Subject: [PATCH 10/21] stream synchronise --- Grid/lattice/Lattice_reduction_gpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 4bdcce0b..bd83a1ea 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -217,19 +217,19 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi // which worked with earlier drivers. // Not sure which driver had first fail and this bears checking // Is awkward as must install multiple driver versions -#undef UVM_BLOCK_BUFFER +#undef UVM_BLOCK_BUFFER #ifndef UVM_BLOCK_BUFFER commVector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); + reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); #else Vector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); + reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); result = *buffer_v; #endif From 2fbcf13c46b1e7d0d0c6c7f9ead0f58d12c78cb5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 14:25:14 -0700 Subject: [PATCH 11/21] SYCL fix --- Grid/communicator/SharedMemoryMPI.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 795f3928..792f8405 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -36,9 +36,11 @@ Author: Christoph Lehner #ifdef GRID_HIP #include #endif -#ifdef GRID_SYCl +#ifdef GRID_SYCL #endif +#define GRID_SYCL_LEVEL_ZERO_IPC + NAMESPACE_BEGIN(Grid); #define header "SharedMemoryMpi: " From dd3bbb8fa2ff3878045f3b7e01e68036baac9bcb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 17:27:45 -0700 Subject: [PATCH 12/21] MOve the synchronise out to the stencil so one call instead of one call per packet --- Grid/communicator/Communicator_mpi3.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index b8ce7bca..280f5c4e 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -401,8 +401,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { // std::cout << "Copy Synchronised\n"< Date: Mon, 27 Mar 2023 17:28:38 -0700 Subject: [PATCH 13/21] Move the copy synch out to stencil and do one per call instead of one per packet --- Grid/stencil/Stencil.h | 48 +++++++++++------------------------------- 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 65d878cb..6296df4e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -398,6 +398,8 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + // Buffers are gathered AND synchronised + // Copies are MPI ISend OR asynch copy on copy stream reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilBarrier();// Synch shared memory on a single nodes } void CommunicateComplete(std::vector > &reqs) { + // complete intranode + acceleratorCopySynchronise(); + // complete MPI for(int i=0;iStencilSendToRecvFromComplete(reqs[i],i); } + // Everyone agrees we are all done + _grid->StencilBarrier(); commtime+=usecond(); } //////////////////////////////////////////////////////////////////////// @@ -425,33 +431,9 @@ public: //////////////////////////////////////////////////////////////////////// void Communicate(void) { - if ( 0 ){ - thread_region { - // must be called in parallel region - int mythread = thread_num(); - int maxthreads= thread_max(); - int nthreads = CartesianCommunicator::nCommThreads; - assert(nthreads <= maxthreads); - if (nthreads == -1) nthreads = 1; - if (mythread < nthreads) { - for (int i = mythread; i < Packets.size(); i += nthreads) { - double start = usecond(); - uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - comm_bytes_thr[mythread] += bytes; - shm_bytes_thr[mythread] += Packets[i].bytes - bytes; - comm_time_thr[mythread] += usecond() - start; - } - } - } - } else { // Concurrent and non-threaded asynch calls to MPI - std::vector > reqs; - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - } + std::vector > reqs; + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); } template void HaloExchange(const Lattice &source,compressor &compress) @@ -527,7 +509,6 @@ public: _grid->StencilBarrier();// Synch shared memory on a single nodes mpi3synctime_g+=usecond(); - // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); halogtime-=usecond(); @@ -586,13 +567,8 @@ public: CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - mpi3synctime-=usecond(); - accelerator_barrier(); - _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime+=usecond(); - shmmergetime-=usecond(); - CommsMerge(decompress,MergersSHM,DecompressionsSHM); - shmmergetime+=usecond(); + assert(MergersSHM.size()==0); + assert(DecompressionsSHM.size()==0); } template From 8feedb4f6f74498362785cc6cae8d43ad7b5988f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 17:29:21 -0700 Subject: [PATCH 14/21] Include files moved --- Grid/threads/Accelerator.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index e17e85d1..db998739 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -249,14 +249,16 @@ inline int acceleratorIsCommunicable(void *ptr) ////////////////////////////////////////////// #ifdef GRID_SYCL NAMESPACE_END(Grid); +#if 0 #include #include - -#define GRID_SYCL_LEVEL_ZERO_IPC - -#ifdef GRID_SYCL_LEVEL_ZERO_IPC #include #include +#else +#include +#include +#include +#include #endif NAMESPACE_BEGIN(Grid); From 0efa107cb6ccca45ea27d74ab2b928d206a9811a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 17:29:43 -0700 Subject: [PATCH 15/21] Script update --- systems/PVC/benchmarks/run-1tile.sh | 2 +- systems/PVC/benchmarks/run-2tile-mpi.sh | 14 ++++++-------- systems/PVC/benchmarks/wrap.sh | 10 +++++----- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh index 0fe80247..9a29b773 100755 --- a/systems/PVC/benchmarks/run-1tile.sh +++ b/systems/PVC/benchmarks/run-1tile.sh @@ -4,7 +4,7 @@ #SBATCH -p QZ1J-ICX-PVC ##SBATCH -p QZ1J-SPR-PVC-2C -source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh +#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh export NT=8 diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh index cefab776..5a6a9b8f 100755 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ b/systems/PVC/benchmarks/run-2tile-mpi.sh @@ -4,7 +4,7 @@ #SBATCH -p QZ1J-ICX-PVC -source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh +#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh export NT=16 @@ -19,16 +19,14 @@ export SYCL_DEVICE_FILTER=gpu,level_zero export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 -export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 -for i in 0 +for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 do -mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768 -mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768 +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i done -#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log -#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh index bb7b517d..06ed0ca1 100755 --- a/systems/PVC/benchmarks/wrap.sh +++ b/systems/PVC/benchmarks/wrap.sh @@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK -if [ $MPI_LOCALRANKID = "0" ] -then +#if [ $MPI_LOCALRANKID = "0" ] +#then # ~psteinbr/build_pti/ze_tracer -h $@ - onetrace --chrome-device-timeline $@ -else +# onetrace --chrome-device-timeline $@ +#else $@ -fi +#fi From 900e01f49bca04257ee979e1c7e97c2dc1e3cd9e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 21:35:06 -0700 Subject: [PATCH 16/21] Temporary --- benchmarks/Benchmark_dwf_fp32_paranoid.cc | 387 ++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 benchmarks/Benchmark_dwf_fp32_paranoid.cc diff --git a/benchmarks/Benchmark_dwf_fp32_paranoid.cc b/benchmarks/Benchmark_dwf_fp32_paranoid.cc new file mode 100644 index 00000000..20f23b60 --- /dev/null +++ b/benchmarks/Benchmark_dwf_fp32_paranoid.cc @@ -0,0 +1,387 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + Coordinate latt4 = GridDefaultLatt(); + int Ls=16; + for(int i=0;i> Ls; + } + + GridLogLayout(); + + long unsigned int single_site_flops = 8*Nc*(7+16*Nc); + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); + LatticeFermionF src1 (FGrid); random(RNG5,src1); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; +#if 0 + Umu=1.0; + for(int mu=0;mu(Umu,mu); + // if (mu !=2 ) ttmp = 0; + // ttmp = ttmp* pow(10.0,mu); + PokeIndex(Umu,ttmp,mu); + } + std::cout << GridLogMessage << "Forced to diagonal " << std::endl; +#endif + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + // replicate across fifth dimension + // LatticeGaugeFieldF Umu5d(FGrid); + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + + /* + std::cout << "RESULT\n " << result<Barrier(); + exit(-1); + } + assert (norm2(err)< 1.0e-4 ); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s1.0e-4)){ +/* + std::cout<< "DAG RESULT\n " <Barrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* + std::cout<< "Deo RESULT\n " < Date: Tue, 28 Mar 2023 08:34:24 -0700 Subject: [PATCH 17/21] Commet --- Grid/lattice/Lattice_reduction_gpu.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 4bdcce0b..b1139434 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -211,12 +211,9 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi assert(ok); Integer smemSize = numThreads * sizeof(sobj); - // UVM seems to be buggy under later CUDA drivers - // This fails on A100 and driver 5.30.02 / CUDA 12.1 - // Fails with multiple NVCC versions back to 11.4, - // which worked with earlier drivers. - // Not sure which driver had first fail and this bears checking - // Is awkward as must install multiple driver versions + // Move out of UVM + // Turns out I had messed up the synchronise after move to compute stream + // as running this on the default stream fools the synchronise #undef UVM_BLOCK_BUFFER #ifndef UVM_BLOCK_BUFFER commVector buffer(numBlocks); From 6af97069b93cb61503cad8770e4ccd6b7532ae98 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Mar 2023 13:39:44 -0700 Subject: [PATCH 18/21] Preparing for close of feature/dirichlet Initial code change review complete --- .../WilsonKernelsImplementation.h | 4 ---- ...ayleyFermion5DInstantiationWilsonImplD2.cc | 1 - ...ctionFermion5DInstantiationWilsonImplD2.cc | 1 - ...allEOFAFermionInstantiationWilsonImplD2.cc | 1 - ...iusEOFAFermionInstantiationWilsonImplD2.cc | 1 - ...ctionFermion5DInstantiationWilsonImplD2.cc | 1 - ...nCloverFermionInstantiationWilsonImplD2.cc | 1 - ...ilsonFermion5DInstantiationWilsonImplD2.cc | 1 - .../WilsonFermionInstantiationWilsonImplD2.cc | 1 - .../WilsonKernelsInstantiationWilsonImplD2.cc | 1 - ...ilsonTMFermionInstantiationWilsonImplD2.cc | 1 - .../fermion/instantiation/WilsonImplD2/impl.h | 1 - ...yleyFermion5DInstantiationZWilsonImplD2.cc | 1 - ...tionFermion5DInstantiationZWilsonImplD2.cc | 1 - ...llEOFAFermionInstantiationZWilsonImplD2.cc | 1 - ...usEOFAFermionInstantiationZWilsonImplD2.cc | 1 - ...tionFermion5DInstantiationZWilsonImplD2.cc | 1 - ...lsonFermion5DInstantiationZWilsonImplD2.cc | 1 - ...WilsonKernelsInstantiationZWilsonImplD2.cc | 1 - .../instantiation/ZWilsonImplD2/impl.h | 1 - .../pseudofermion/TwoFlavourEvenOddRatio.h | 13 ------------ Grid/qcd/hmc/integrators/Integrator.h | 21 +++++++------------ 22 files changed, 8 insertions(+), 49 deletions(-) delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc delete mode 100644 Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc delete mode 100644 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index fcf1f1f3..d7541054 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -463,11 +463,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} -#ifdef SYCL_HACK - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; } -#else if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} -#endif #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 9cc05107..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 5f6ab65e..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index d5789bcf..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonTMFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h deleted file mode 100644 index a836ff03..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION WilsonImplD2 diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h deleted file mode 100644 index 067d6080..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION ZWilsonImplD2 diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h index 476b1c53..c0e2c1d3 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h @@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid); // NumOp == V // DenOp == M // - AUDIT(); FermionField etaOdd (NumOp.FermionRedBlackGrid()); FermionField etaEven(NumOp.FermionRedBlackGrid()); FermionField tmp (NumOp.FermionRedBlackGrid()); - AUDIT(); pickCheckerboard(Even,etaEven,eta); - AUDIT(); pickCheckerboard(Odd,etaOdd,eta); - AUDIT(); NumOp.ImportGauge(U); - AUDIT(); DenOp.ImportGauge(U); std::cout << " TwoFlavourRefresh: Imported gauge "< Mpc(DenOp); - AUDIT(); SchurDifferentiableOperator Vpc(NumOp); - AUDIT(); std::cout << " TwoFlavourRefresh: Diff ops "<deriv_timer_start(); as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta as[level].actions.at(a)->deriv_timer_stop(); std::cout << GridLogMessage << "AuditForce["<is_smeared << std::endl; auto name = as[level].actions.at(a)->action_name(); @@ -382,12 +380,12 @@ public: Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "AuditRefresh["<refresh_timer_start(); as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); as[level].actions.at(actionID)->refresh_timer_stop(); std::cout << GridLogMessage << "AuditRefresh["<is_smeared); @@ -434,7 +432,7 @@ public: as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; - AUDIT(); + } as[level].apply(S_hireps, Representations, level, H); } @@ -447,9 +445,9 @@ public: void operator()(std::vector*> repr_set, Repr& Rep, int level, RealD& H) { for (int a = 0; a < repr_set.size(); ++a) { - AUDIT(); + RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); - AUDIT(); + std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; H += Hterm; @@ -474,10 +472,10 @@ public: Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; as[level].actions.at(actionID)->S_timer_start(); - AUDIT(); + Hterm = as[level].actions.at(actionID)->Sinitial(Us); as[level].actions.at(actionID)->S_timer_stop(); - AUDIT(); + std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; } @@ -490,7 +488,6 @@ public: void integrate(Field& U) { - AUDIT(); // reset the clocks t_U = 0; for (int level = 0; level < as.size(); ++level) { @@ -508,10 +505,8 @@ public: assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } - AUDIT(); FieldImplementation::Project(U); - AUDIT(); // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); From 4a261fab303b0338cb0529d83144860535155675 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Mar 2023 20:04:21 -0700 Subject: [PATCH 19/21] Changes premerge to develop --- Grid/lattice/Lattice_rng.h | 11 +---------- .../implementation/WilsonKernelsImplementation.h | 9 +++++---- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 180b8437..b7ef0e82 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -440,17 +440,8 @@ public: _grid->GlobalCoorToGlobalIndex(gcoor,gidx); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); -#if 1 - assert(rank == _grid->ThisRank() ); -#else -// - if (rank != _grid->ThisRank() ){ - std::cout <<"rank "<ThisRank() "<<_grid->ThisRank()<< std::endl; -// exit(-42); -// assert(0); - } -#endif + assert(rank == _grid->ThisRank() ); int l_idx=generator_idx(o_idx,i_idx); _generators[l_idx] = master_engine; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index d7541054..ce7cd49c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -462,6 +462,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} #ifndef GRID_CUDA @@ -474,6 +475,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} #ifndef GRID_CUDA @@ -493,15 +495,15 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif - acceleratorFenceComputeStream(); } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif @@ -512,7 +514,6 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} #endif - acceleratorFenceComputeStream(); } assert(0 && " Kernel optimisation case not covered "); } From 7212432f431c6a8e784bbe77bcdf9f294fce4296 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Mar 2023 20:10:22 -0700 Subject: [PATCH 20/21] More careful fencing --- .../action/fermion/implementation/WilsonKernelsImplementation.h | 2 -- Grid/stencil/Stencil.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index ce7cd49c..70e2477f 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -462,7 +462,6 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { - acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} #ifndef GRID_CUDA @@ -495,7 +494,6 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { - acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index c9287bf2..1568cbf9 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -665,9 +665,11 @@ public: for(int i=0;i Date: Wed, 29 Mar 2023 14:36:50 -0400 Subject: [PATCH 21/21] Compile fix on Nvidia --- Grid/communicator/SharedMemoryMPI.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 3248d328..3a70395c 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -38,9 +38,8 @@ Author: Christoph Lehner #include #endif #ifdef GRID_SYCL - -#endif #define GRID_SYCL_LEVEL_ZERO_IPC +#endif NAMESPACE_BEGIN(Grid);