diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 47a0a92b..9eaf89f3 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB); #include #include #include +#include #include #include #include diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h new file mode 100644 index 00000000..93f5c677 --- /dev/null +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h @@ -0,0 +1,213 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h + + Copyright (C) 2015 + + Author: Raoul Hodgson + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H +#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H + +NAMESPACE_BEGIN(Grid); + +//Mixed precision restarted defect correction CG +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class MixedPrecisionConjugateGradientBatched : public LinearFunction { +public: + using LinearFunction::operator(); + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + Integer MaxPatchupIterations; + GridBase* SinglePrecGrid; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + LinearOperatorBase &Linop_f; + LinearOperatorBase &Linop_d; + + //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess + LinearFunction *guesser; + bool updateResidual; + + MixedPrecisionConjugateGradientBatched(RealD tol, + Integer maxinnerit, + Integer maxouterit, + Integer maxpatchit, + GridBase* _sp_grid, + LinearOperatorBase &_Linop_f, + LinearOperatorBase &_Linop_d, + bool _updateResidual=true) : + Linop_f(_Linop_f), Linop_d(_Linop_d), + Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), + OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; + + void useGuesser(LinearFunction &g){ + guesser = &g; + } + + void operator() (const FieldD &src_d_in, FieldD &sol_d){ + std::vector srcs_d_in{src_d_in}; + std::vector sols_d{sol_d}; + + (*this)(srcs_d_in,sols_d); + + sol_d = sols_d[0]; + } + + void operator() (const std::vector &src_d_in, std::vector &sol_d){ + assert(src_d_in.size() == sol_d.size()); + int NBatch = src_d_in.size(); + + std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; + + Integer TotalOuterIterations = 0; //Number of restarts + std::vector TotalInnerIterations(NBatch,0); //Number of inner CG iterations + std::vector TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step + + GridStopWatch TotalTimer; + TotalTimer.Start(); + + GridStopWatch InnerCGtimer; + GridStopWatch PrecChangeTimer; + + int cb = src_d_in[0].Checkerboard(); + + std::vector src_norm; + std::vector norm; + std::vector stop; + + GridBase* DoublePrecGrid = src_d_in[0].Grid(); + FieldD tmp_d(DoublePrecGrid); + tmp_d.Checkerboard() = cb; + + FieldD tmp2_d(DoublePrecGrid); + tmp2_d.Checkerboard() = cb; + + std::vector src_d; + std::vector src_f; + std::vector sol_f; + + for (int i=0; i CG_f(inner_tol, MaxInnerIterations); + CG_f.ErrorOnNoConverge = false; + + Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count + + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; + + bool allConverged = true; + + for (int i=0; i OuterLoopNormMult * stop[i]) { + allConverged = false; + } + } + if (allConverged) break; + + if (updateResidual) { + RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); + RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); + while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? + CG_f.Tolerance = inner_tol; + } + + //Optionally improve inner solver guess (eg using known eigenvectors) + if(guesser != NULL) { + (*guesser)(src_f, sol_f); + } + + for (int i=0; i CG_d(Tolerance, MaxPatchupIterations); + CG_d(Linop_d, src_d_in[i], sol_d[i]); + TotalFinalStepIterations[i] += CG_d.IterationsToComplete; + } + + TotalTimer.Stop(); + + std::cout << GridLogMessage << std::endl; + for (int i=0; i=0) && (Nc < NallocCacheMax)) { + Ncache[CpuHuge]=Nc; + Ncache[AccHuge]=Nc; + Ncache[SharedHuge]=Nc; + } + } + str= getenv("GRID_ALLOC_NCACHE_SMALL"); if ( str ) { Nc = atoi(str); @@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) { std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]); #else return ptr; @@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif + if (ncache == 0) return ptr; + void * ret = NULL; int v = -1; @@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries void *MemoryManager::Lookup(size_t bytes,int type) { #ifdef ALLOCATION_CACHE - bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); - int cache = type+small; + int cache; + if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; + else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); #else return NULL; @@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index c22a54f3..0dc78f04 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? #define GRID_ALLOC_SMALL_LIMIT (4096) +#define GRID_ALLOC_HUGE_LIMIT (2147483648) #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) @@ -70,6 +71,21 @@ enum ViewMode { CpuWriteDiscard = 0x10 // same for now }; +struct MemoryStatus { + uint64_t DeviceBytes; + uint64_t DeviceLRUBytes; + uint64_t DeviceMaxBytes; + uint64_t HostToDeviceBytes; + uint64_t DeviceToHostBytes; + uint64_t HostToDeviceXfer; + uint64_t DeviceToHostXfer; + uint64_t DeviceEvictions; + uint64_t DeviceDestroy; + uint64_t DeviceAllocCacheBytes; + uint64_t HostAllocCacheBytes; +}; + + class MemoryManager { private: @@ -83,7 +99,7 @@ private: } AllocationCacheEntry; static const int NallocCacheMax=128; - static const int NallocType=6; + static const int NallocType=9; static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; static int Victim[NallocType]; static int Ncache[NallocType]; @@ -121,7 +137,26 @@ private: static uint64_t DeviceToHostXfer; static uint64_t DeviceEvictions; static uint64_t DeviceDestroy; - + + static uint64_t DeviceCacheBytes(); + static uint64_t HostCacheBytes(); + + static MemoryStatus GetFootprint(void) { + MemoryStatus stat; + stat.DeviceBytes = DeviceBytes; + stat.DeviceLRUBytes = DeviceLRUBytes; + stat.DeviceMaxBytes = DeviceMaxBytes; + stat.HostToDeviceBytes = HostToDeviceBytes; + stat.DeviceToHostBytes = DeviceToHostBytes; + stat.HostToDeviceXfer = HostToDeviceXfer; + stat.DeviceToHostXfer = DeviceToHostXfer; + stat.DeviceEvictions = DeviceEvictions; + stat.DeviceDestroy = DeviceDestroy; + stat.DeviceAllocCacheBytes = DeviceCacheBytes(); + stat.HostAllocCacheBytes = HostCacheBytes(); + return stat; + }; + private: #ifndef GRID_UVM ////////////////////////////////////////////////////////////////////// diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 892e3dbe..e7d7a96d 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { - acceleratorCopySynchronise(); - StencilBarrier();// Synch shared memory on a single nodes - int nreq=list.size(); if (nreq==0) return; diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 9a273dc4..3248d328 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -37,9 +37,11 @@ Author: Christoph Lehner #ifdef GRID_HIP #include #endif -#ifdef GRID_SYCl +#ifdef GRID_SYCL #endif +#define GRID_SYCL_LEVEL_ZERO_IPC + NAMESPACE_BEGIN(Grid); #define header "SharedMemoryMpi: " diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index cf902b58..742c99da 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -297,6 +297,30 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA } } +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + +template +T iDivUp(T a, T b) // Round a / b to nearest higher integer value +{ return (a % b != 0) ? (a / b + 1) : (a / b); } + +template +__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) +{ + int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx >= e1*e2) return; + + int n, b, o; + + n = idx / e2; + b = idx % e2; + o = n*stride + b; + + vector[2*idx + 0] = lo + o; + vector[2*idx + 1] = ro + o; +} + +#endif + ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// @@ -321,12 +345,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int ent=0; if(cbmask == 0x3 ){ +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + ent = e1*e2; + dim3 blockSize(acceleratorThreads()); + dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); + populate_Cshift_table<<>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o,ro+o); } } +#endif } else { for(int n=0;n void Copy_plane_permute(Lattice& lhs,const Lattice>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o+b,ro+o+b); }} +#endif } else { for(int n=0;n -inline typename vobj::scalar_object sum(const Lattice &arg) +inline typename vobj::scalar_object rankSum(const Lattice &arg) { Integer osites = arg.Grid()->oSites(); #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) - typename vobj::scalar_object ssum; autoView( arg_v, arg, AcceleratorRead); - ssum= sum_gpu(&arg_v[0],osites); + return sum_gpu(&arg_v[0],osites); #else autoView(arg_v, arg, CpuRead); - auto ssum= sum_cpu(&arg_v[0],osites); + return sum_cpu(&arg_v[0],osites); #endif +} + +template +inline typename vobj::scalar_object sum(const Lattice &arg) +{ + auto ssum = rankSum(arg); arg.Grid()->GlobalSum(ssum); return ssum; } template -inline typename vobj::scalar_object sum_large(const Lattice &arg) +inline typename vobj::scalar_object rankSumLarge(const Lattice &arg) { #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum_gpu_large(&arg_v[0],osites); + return sum_gpu_large(&arg_v[0],osites); #else autoView(arg_v, arg, CpuRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum_cpu(&arg_v[0],osites); + return sum_cpu(&arg_v[0],osites); #endif +} + +template +inline typename vobj::scalar_object sum_large(const Lattice &arg) +{ + auto ssum = rankSumLarge(arg); arg.Grid()->GlobalSum(ssum); return ssum; } diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 6f9fc480..895e2ce7 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -288,7 +288,36 @@ inline void blockProject(Lattice > &coarseData, blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); } } +template +inline void batchBlockProject(std::vector>> &coarseData, + const std::vector> &fineData, + const VLattice &Basis) +{ + int NBatch = fineData.size(); + assert(coarseData.size() == NBatch); + GridBase * fine = fineData[0].Grid(); + GridBase * coarse= coarseData[0].Grid(); + + Lattice> ip(coarse); + std::vector> fineDataCopy = fineData; + + autoView(ip_, ip, AcceleratorWrite); + for(int v=0;v + accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { + convertType(coarseData_[sc](v),ip_[sc]); + }); + + // improve numerical stability of projection + // |fine> = |fine> - |basis> + ip=-ip; + blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); + } + } +} template inline void blockZAXPY(Lattice &fineZ, @@ -590,6 +619,26 @@ inline void blockPromote(const Lattice > &coarseData, } #endif +template +inline void batchBlockPromote(const std::vector>> &coarseData, + std::vector> &fineData, + const VLattice &Basis) +{ + int NBatch = coarseData.size(); + assert(fineData.size() == NBatch); + + GridBase * fine = fineData[0].Grid(); + GridBase * coarse = coarseData[0].Grid(); + for (int k=0; k> ip = PeekIndex<0>(coarseData[k],i); + blockZAXPY(fineData[k],ip,Basis[i],fineData[k]); + } + } +} + // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Simd layouts need not match since we use peek/poke Local template diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index f052cb25..c9287bf2 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -434,7 +434,6 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { - accelerator_barrier(); for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -443,7 +442,6 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } - _grid->StencilBarrier();// Synch shared memory on a single nodes } void CommunicateComplete(std::vector > &reqs) @@ -452,6 +450,9 @@ public: if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); + acceleratorCopySynchronise(); + // Everyone agrees we are all done + _grid->StencilBarrier(); } //////////////////////////////////////////////////////////////////////// // Blocking send and receive. Either sequential or parallel. @@ -529,7 +530,6 @@ public: { _grid->StencilBarrier();// Synch shared memory on a single nodes - // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); u_comm_offset=0; @@ -655,8 +655,8 @@ public: CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - _grid->StencilBarrier();// Synch shared memory on a single nodes - CommsMerge(decompress,MergersSHM,DecompressionsSHM); + assert(MergersSHM.size()==0); + assert(DecompressionsSHM.size()==0); } template diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh index 0fe80247..9a29b773 100755 --- a/systems/PVC/benchmarks/run-1tile.sh +++ b/systems/PVC/benchmarks/run-1tile.sh @@ -4,7 +4,7 @@ #SBATCH -p QZ1J-ICX-PVC ##SBATCH -p QZ1J-SPR-PVC-2C -source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh +#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh export NT=8 diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh index decbb6cd..1db67508 100755 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ b/systems/PVC/benchmarks/run-2tile-mpi.sh @@ -4,7 +4,7 @@ #SBATCH -p QZ1J-ICX-PVC -source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh +#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh export NT=16 @@ -19,11 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 -export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 -#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log +for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 +do +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i +done mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh index 0e48625b..b8806b30 100755 --- a/systems/PVC/benchmarks/wrap.sh +++ b/systems/PVC/benchmarks/wrap.sh @@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK -#if [ $MPI_LOCALRANKID = "0" ] -#then -# ~psteinbr/build_pti/ze_tracer -c $@ -# onetrace --chrome-kernel-timeline $@ -#else $@ -#fi +