diff --git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h index 64e4faf4..015e19d1 100644 --- a/Grid/DisableWarnings.h +++ b/Grid/DisableWarnings.h @@ -45,7 +45,7 @@ directory //disables nvcc specific warning in json.hpp #pragma clang diagnostic ignored "-Wdeprecated-register" -#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5) +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ //disables nvcc specific warning in json.hpp #pragma nv_diag_suppress unsigned_compare_with_zero #pragma nv_diag_suppress cast_to_qualified_type diff --git a/Grid/GridCore.h b/Grid/GridCore.h index 8e04a859..41c64ef6 100644 --- a/Grid/GridCore.h +++ b/Grid/GridCore.h @@ -44,10 +44,10 @@ Author: paboyle #include #include #include -#include //#include #include #include +#include #include #include #include diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index 5aee81de..bdd39a65 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -14,7 +14,7 @@ /* NVCC save and restore compile environment*/ #ifdef __NVCC__ #pragma push -#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5) +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #pragma nv_diag_suppress code_is_unreachable #else #pragma diag_suppress code_is_unreachable diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index a9a82f34..7008008c 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -262,7 +262,7 @@ public: autoView( Tnp_v , (*Tnp), AcceleratorWrite); autoView( Tnm_v , (*Tnm), AcceleratorWrite); const int Nsimd = CComplex::Nsimd(); - accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { + accelerator_for(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); }); @@ -324,9 +324,9 @@ public: GridBase* _cbgrid; int hermitian; - CartesianStencil Stencil; - CartesianStencil StencilEven; - CartesianStencil StencilOdd; + CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; std::vector A; std::vector Aeven; @@ -631,7 +631,7 @@ public: assert(Aself != nullptr); } - void DselfInternal(CartesianStencil &st, CoarseMatrix &a, + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, const CoarseVector &in, CoarseVector &out, int dag) { int point = geom.npoint-1; autoView( out_v, out, AcceleratorWrite); @@ -694,7 +694,7 @@ public: } } - void DhopInternal(CartesianStencil &st, std::vector &a, + void DhopInternal(CartesianStencil &st, std::vector &a, const CoarseVector &in, CoarseVector &out, int dag) { SimpleCompressor compressor; @@ -784,9 +784,9 @@ public: _cbgrid(new GridRedBlackCartesian(&CoarseGrid)), geom(CoarseGrid._ndimension), hermitian(hermitian_), - Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0), + Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements), A(geom.npoint,&CoarseGrid), Aeven(geom.npoint,_cbgrid), Aodd(geom.npoint,_cbgrid), @@ -804,9 +804,9 @@ public: _cbgrid(&CoarseRBGrid), geom(CoarseGrid._ndimension), hermitian(hermitian_), - Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0), + Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements), A(geom.npoint,&CoarseGrid), Aeven(geom.npoint,&CoarseRBGrid), Aodd(geom.npoint,&CoarseRBGrid), diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index b1cf4d97..090fed46 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -526,6 +526,7 @@ public: (*this)(Linop,in[k],out[k]); } }; + virtual ~OperatorFunction(){}; }; template class LinearFunction { diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 584ed1d5..1d6984f3 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -258,26 +258,12 @@ public: for(int n=2;nView(); - auto Tnp_v = Tnp->View(); - auto Tnm_v = Tnm->View(); - constexpr int Nsimd = vector_type::Nsimd(); - accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, { - coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); - coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); - }); - if ( Coeffs[n] != 0.0) { - axpy(out,Coeffs[n],*Tnp,out); - } -#else axpby(y,xscale,mscale,y,(*Tn)); axpby(*Tnp,2.0,-1.0,y,(*Tnm)); if ( Coeffs[n] != 0.0) { axpy(out,Coeffs[n],*Tnp,out); } -#endif + // Cycle pointers to avoid copies Field *swizzle = Tnm; Tnm =Tn; diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h index 31ac55e0..27fee791 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h @@ -108,7 +108,10 @@ NAMESPACE_BEGIN(Grid); GridStopWatch PrecChangeTimer; Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count - + + precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid); + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ //Compute double precision rsd and also new RHS vector. Linop_d.HermOp(sol_d, tmp_d); @@ -123,7 +126,7 @@ NAMESPACE_BEGIN(Grid); while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? PrecChangeTimer.Start(); - precisionChange(src_f, src_d); + precisionChange(src_f, src_d, pc_wk_dp_to_sp); PrecChangeTimer.Stop(); sol_f = Zero(); @@ -142,7 +145,7 @@ NAMESPACE_BEGIN(Grid); //Convert sol back to double and add to double prec solution PrecChangeTimer.Start(); - precisionChange(tmp_d, sol_f); + precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp); PrecChangeTimer.Stop(); axpy(sol_d, 1.0, tmp_d, sol_d); diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h new file mode 100644 index 00000000..17714f09 --- /dev/null +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h @@ -0,0 +1,373 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: Christopher Kelly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. +//The residual is stored in single precision, but the search directions and solution are stored in double precision. +//Every update_freq iterations the residual is corrected in double precision. +//For safety the a final regular CG is applied to clean up if necessary + +//PB Pure single, then double fixup + +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction, + public OperatorFunction +{ +public: + + using OperatorFunction::operator(); + + RealD Tolerance; + Integer MaxIterationsMshift; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + std::vector IterationsToCompleteShift; // Iterations for this shift + int verbose; + MultiShiftFunction shifts; + std::vector TrueResidualShift; + + int ReliableUpdateFreq; //number of iterations between reliable updates + + GridBase* SinglePrecGrid; //Grid for single-precision fields + LinearOperatorBase &Linop_f; //single precision + + ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts, + GridBase* _SinglePrecGrid, LinearOperatorBase &_Linop_f, + int _ReliableUpdateFreq) : + MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq), + MaxIterations(20000) + { + verbose=1; + IterationsToCompleteShift.resize(_shifts.order); + TrueResidualShift.resize(_shifts.order); + } + + void operator() (LinearOperatorBase &Linop, const FieldD &src, FieldD &psi) + { + GridBase *grid = src.Grid(); + int nshift = shifts.order; + std::vector results(nshift,grid); + (*this)(Linop,src,results,psi); + } + void operator() (LinearOperatorBase &Linop, const FieldD &src, std::vector &results, FieldD &psi) + { + int nshift = shifts.order; + + (*this)(Linop,src,results); + + psi = shifts.norm*src; + for(int i=0;i &Linop_d, const FieldD &src_d, std::vector &psi_d) + { + GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup"); + GridBase *DoublePrecGrid = src_d.Grid(); + + //////////////////////////////////////////////////////////////////////// + // Convenience references to the info stored in "MultiShiftFunction" + //////////////////////////////////////////////////////////////////////// + int nshift = shifts.order; + + std::vector &mass(shifts.poles); // Make references to array in "shifts" + std::vector &mresidual(shifts.tolerances); + std::vector alpha(nshift,1.0); + + //Double precision search directions + FieldD p_d(DoublePrecGrid); + std::vector ps_f (nshift, SinglePrecGrid);// Search directions (single precision) + std::vector psi_f(nshift, SinglePrecGrid);// solutions (single precision) + + FieldD tmp_d(DoublePrecGrid); + FieldD r_d(DoublePrecGrid); + FieldF r_f(SinglePrecGrid); + FieldD mmp_d(DoublePrecGrid); + + assert(psi_d.size()==nshift); + assert(mass.size()==nshift); + assert(mresidual.size()==nshift); + + // dynamic sized arrays on stack; 2d is a pain with vector + RealD bs[nshift]; + RealD rsq[nshift]; + RealD rsqf[nshift]; + RealD z[nshift][2]; + int converged[nshift]; + + const int primary =0; + + //Primary shift fields CG iteration + RealD a,b,c,d; + RealD cp,bp,qq; //prev + + // Matrix mult fields + FieldF p_f(SinglePrecGrid); + FieldF mmp_f(SinglePrecGrid); + + // Check lightest mass + for(int s=0;s= mass[primary] ); + converged[s]=0; + } + + // Wire guess to zero + // Residuals "r" are src + // First search direction "p" is also src + cp = norm2(src_d); + + // Handle trivial case of zero src. + if( cp == 0. ){ + for(int s=0;s= rsq[s]){ + CleanupTimer.Start(); + std::cout< Linop_shift_d(Linop_d, mass[s]); + ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop Linop_shift_f(Linop_f, mass[s]); + + MixedPrecisionConjugateGradient cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); + cg(src_d, psi_d[s]); + + TrueResidualShift[s] = cg.TrueResidual; + CleanupTimer.Stop(); + } + } + + std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<::operator(); RealD Tolerance; + Integer MaxIterationsMshift; Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion std::vector IterationsToCompleteShift; // Iterations for this shift @@ -95,9 +96,9 @@ public: ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts, GridBase* _SinglePrecGrid, LinearOperatorBase &_Linop_f, - int _ReliableUpdateFreq - ) : - MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq) + int _ReliableUpdateFreq) : + MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq), + MaxIterations(20000) { verbose=1; IterationsToCompleteShift.resize(_shifts.order); @@ -130,6 +131,9 @@ public: GRID_TRACE("ConjugateGradientMultiShiftMixedPrec"); GridBase *DoublePrecGrid = src_d.Grid(); + precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid); + precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid); + //////////////////////////////////////////////////////////////////////// // Convenience references to the info stored in "MultiShiftFunction" //////////////////////////////////////////////////////////////////////// @@ -154,6 +158,7 @@ public: // dynamic sized arrays on stack; 2d is a pain with vector RealD bs[nshift]; RealD rsq[nshift]; + RealD rsqf[nshift]; RealD z[nshift][2]; int converged[nshift]; @@ -164,12 +169,8 @@ public: RealD cp,bp,qq; //prev // Matrix mult fields - FieldF r_f(SinglePrecGrid); FieldF p_f(SinglePrecGrid); - FieldF tmp_f(SinglePrecGrid); FieldF mmp_f(SinglePrecGrid); - FieldF src_f(SinglePrecGrid); - precisionChange(src_f, src_d); // Check lightest mass for(int s=0;s &Linop_f; LinearOperatorBase &Linop_d; GridBase* SinglePrecGrid; - RealD Delta; //reliable update parameter + RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single LinearOperatorBase *Linop_fallback; @@ -65,7 +65,9 @@ public: ErrorOnNoConverge(err_on_no_conv), DoFinalCleanup(true), Linop_fallback(NULL) - {}; + { + assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1"); + }; void setFallbackLinop(LinearOperatorBase &_Linop_fallback, const RealD _fallback_transition_tol){ Linop_fallback = &_Linop_fallback; @@ -116,9 +118,12 @@ public: } //Single prec initialization + precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid()); + FieldF r_f(SinglePrecGrid); r_f.Checkerboard() = r.Checkerboard(); - precisionChange(r_f, r); + precisionChange(r_f, r, pc_wk_dp_to_sp); FieldF psi_f(r_f); psi_f = Zero(); @@ -134,7 +139,8 @@ public: GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; GridStopWatch SolverTimer; - + GridStopWatch PrecChangeTimer; + SolverTimer.Start(); int k = 0; int l = 0; @@ -173,7 +179,9 @@ public: // Stopping condition if (cp <= rsq) { //Although not written in the paper, I assume that I have to add on the final solution - precisionChange(mmp, psi_f); + PrecChangeTimer.Start(); + precisionChange(mmp, psi_f, pc_wk_sp_to_dp); + PrecChangeTimer.Stop(); psi = psi + mmp; @@ -194,7 +202,10 @@ public: std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < DeviceMaxBytes){ if ( DeviceLRUBytes > 0){ assert(LRU.size()>0); - uint64_t victim = LRU.back(); + uint64_t victim = LRU.back(); // From the LRU auto AccCacheIterator = EntryLookup(victim); auto & AccCache = AccCacheIterator->second; Evict(AccCache); + } else { + return; } } } @@ -247,11 +265,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %lx %lx : %ld %ld\n", + dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, - (uint64_t)bytes); + (uint64_t)bytes, + (uint64_t)AccCache.accLock); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -286,6 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // Empty + AccRead => Consistent } AccCache.accLock= 1; + dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); } else if(AccCache.state==CpuDirty ){ if(mode==AcceleratorWriteDiscard) { CpuDiscard(AccCache); @@ -298,28 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); } else { assert(0); } - // If view is opened on device remove from LRU + assert(AccCache.accLock>0); + // If view is opened on device must remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU + dprintf("AccCache entry removed from LRU \n"); LRUremove(AccCache); } @@ -340,10 +362,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) assert(AccCache.accLock>0); AccCache.accLock--; - // Move to LRU queue if not locked and close on device if(AccCache.accLock==0) { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); LRUinsert(AccCache); + } else { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); } } void MemoryManager::CpuViewClose(uint64_t CpuPtr) @@ -380,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - if (!AccCache.AccPtr) { - EvictVictims(bytes); - } + // CPU doesn't need to free space + // if (!AccCache.AccPtr) { + // EvictVictims(bytes); + // } assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error @@ -436,20 +461,28 @@ void MemoryManager::NotifyDeletion(void *_ptr) void MemoryManager::Print(void) { PrintBytes(); - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "Memory Manager " << std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; - std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; - std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; - std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; - std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; - std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; @@ -459,13 +492,13 @@ void MemoryManager::Print(void) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - std::cout << GridLogDebug << "0x"<second; + LruBytes2+=AccCache.bytes; + assert(AccCache.LRU_valid==1); + assert(AccCache.LRU_entry==it); + } + std::cout << " Memory Manager::Audit() LRU queue matches table entries "<second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + CpuBytes+=AccCache.bytes; + if( AccCache.AccPtr ) AccBytes+=AccCache.bytes; + if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes; + if( AccCache.LRU_valid ) LruCnt++; + + if ( AccCache.cpuLock || AccCache.accLock ) { + assert(AccCache.LRU_valid==0); + + std::cout << GridLogError << s<< "\n\t 0x"<Device memory movement not currently managed by Grid." << std::endl; }; void MemoryManager::Print(void){}; +void MemoryManager::PrintAll(void){}; void MemoryManager::NotifyDeletion(void *ptr){}; NAMESPACE_END(Grid); diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index cb3b9f0e..b98424a1 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -131,7 +131,7 @@ public: template void GlobalSum(obj &o){ typedef typename obj::scalar_type scalar_type; int words = sizeof(obj)/sizeof(scalar_type); - scalar_type * ptr = (scalar_type *)& o; + scalar_type * ptr = (scalar_type *)& o; // Safe alias GlobalSumVector(ptr,words); } @@ -155,7 +155,7 @@ public: int xmit_to_rank,int do_xmit, void *recv, int recv_from_rank,int do_recv, - int bytes,int dir); + int xbytes,int rbytes,int dir); void StencilSendToRecvFromComplete(std::vector &waitall,int i); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index fef4ea1f..892e3dbe 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -343,7 +343,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int bytes,int dir) { std::vector list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } @@ -353,7 +353,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); assert(shm!=NULL); - acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); + acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); } } - - /* if ( CommunicatorPolicy == CommunicatorPolicySequential ) { - * this->StencilSendToRecvFromComplete(list,dir); - * list.resize(0); - * } - */ + return off_node_bytes; } void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 4b533c4b..5d917b79 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -126,7 +126,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector #include #include +#include #ifdef GRID_CUDA #include diff --git a/Grid/json/json.hpp b/Grid/json/json.hpp index 618aa7a1..cb27e058 100644 --- a/Grid/json/json.hpp +++ b/Grid/json/json.hpp @@ -1,13 +1,12 @@ -#ifndef __NVCC__ /* __ _____ _____ _____ __| | __| | | | JSON for Modern C++ -| | |__ | | | | | | version 3.2.0 +| | |__ | | | | | | version 3.10.5 |_____|_____|_____|_|___| https://github.com/nlohmann/json Licensed under the MIT License . SPDX-License-Identifier: MIT -Copyright (c) 2013-2018 Niels Lohmann . +Copyright (c) 2013-2022 Niels Lohmann . Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,588 +27,56 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef NLOHMANN_JSON_HPP -#define NLOHMANN_JSON_HPP +/****************************************************************************\ + * Note on documentation: The source files contain links to the online * + * documentation of the public API at https://json.nlohmann.me. This URL * + * contains the most recent documentation and should also be applicable to * + * previous versions; documentation for deprecated functions is not * + * removed, but marked deprecated. See "Generate documentation" section in * + * file doc/README.md. * +\****************************************************************************/ + +#ifndef INCLUDE_NLOHMANN_JSON_HPP_ +#define INCLUDE_NLOHMANN_JSON_HPP_ #define NLOHMANN_JSON_VERSION_MAJOR 3 -#define NLOHMANN_JSON_VERSION_MINOR 2 -#define NLOHMANN_JSON_VERSION_PATCH 0 +#define NLOHMANN_JSON_VERSION_MINOR 10 +#define NLOHMANN_JSON_VERSION_PATCH 5 #include // all_of, find, for_each -#include // assert -#include // and, not, or #include // nullptr_t, ptrdiff_t, size_t #include // hash, less #include // initializer_list -#include // istream, ostream -#include // iterator_traits, random_access_iterator_tag +#ifndef JSON_NO_IO + #include // istream, ostream +#endif // JSON_NO_IO +#include // random_access_iterator_tag +#include // unique_ptr #include // accumulate #include // string, stoi, to_string #include // declval, forward, move, pair, swap - -// #include -#ifndef NLOHMANN_JSON_FWD_HPP -#define NLOHMANN_JSON_FWD_HPP - -#include // int64_t, uint64_t -#include // map -#include // allocator -#include // string #include // vector -/*! -@brief namespace for Niels Lohmann -@see https://github.com/nlohmann -@since version 1.0.0 -*/ -namespace nlohmann -{ -/*! -@brief default JSONSerializer template argument - -This serializer ignores the template arguments and uses ADL -([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) -for serialization. -*/ -template -struct adl_serializer; - -template class ObjectType = - std::map, - template class ArrayType = std::vector, - class StringType = std::string, class BooleanType = bool, - class NumberIntegerType = std::int64_t, - class NumberUnsignedType = std::uint64_t, - class NumberFloatType = double, - template class AllocatorType = std::allocator, - template class JSONSerializer = - adl_serializer> -class basic_json; - -/*! -@brief JSON Pointer - -A JSON pointer defines a string syntax for identifying a specific value -within a JSON document. It can be used with functions `at` and -`operator[]`. Furthermore, JSON pointers are the base for JSON patches. - -@sa [RFC 6901](https://tools.ietf.org/html/rfc6901) - -@since version 2.0.0 -*/ -template -class json_pointer; - -/*! -@brief default JSON class - -This type is the default specialization of the @ref basic_json class which -uses the standard template types. - -@since version 1.0.0 -*/ -using json = basic_json<>; -} - -#endif - -// #include - - -// This file contains all internal macro definitions -// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them - -// exclude unsupported compilers -#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) - #if defined(__clang__) - #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 - #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" - #endif - #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) - #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 - #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" - #endif - #endif -#endif - -// disable float-equal warnings on GCC/clang -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - -// disable documentation warnings on clang -#if defined(__clang__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wdocumentation" -#endif - -// allow for portable deprecation warnings -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #define JSON_DEPRECATED __attribute__((deprecated)) -#elif defined(_MSC_VER) - #define JSON_DEPRECATED __declspec(deprecated) -#else - #define JSON_DEPRECATED -#endif - -// allow to disable exceptions -#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) - #define JSON_THROW(exception) throw exception - #define JSON_TRY try - #define JSON_CATCH(exception) catch(exception) - #define JSON_INTERNAL_CATCH(exception) catch(exception) -#else - #define JSON_THROW(exception) std::abort() - #define JSON_TRY if(true) - #define JSON_CATCH(exception) if(false) - #define JSON_INTERNAL_CATCH(exception) if(false) -#endif - -// override exception macros -#if defined(JSON_THROW_USER) - #undef JSON_THROW - #define JSON_THROW JSON_THROW_USER -#endif -#if defined(JSON_TRY_USER) - #undef JSON_TRY - #define JSON_TRY JSON_TRY_USER -#endif -#if defined(JSON_CATCH_USER) - #undef JSON_CATCH - #define JSON_CATCH JSON_CATCH_USER - #undef JSON_INTERNAL_CATCH - #define JSON_INTERNAL_CATCH JSON_CATCH_USER -#endif -#if defined(JSON_INTERNAL_CATCH_USER) - #undef JSON_INTERNAL_CATCH - #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER -#endif - -// manual branch prediction -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #define JSON_LIKELY(x) __builtin_expect(!!(x), 1) - #define JSON_UNLIKELY(x) __builtin_expect(!!(x), 0) -#else - #define JSON_LIKELY(x) x - #define JSON_UNLIKELY(x) x -#endif - -// C++ language standard detection -#if (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 - #define JSON_HAS_CPP_17 - #define JSON_HAS_CPP_14 -#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) - #define JSON_HAS_CPP_14 -#endif - -// Ugly macros to avoid uglier copy-paste when specializing basic_json. They -// may be removed in the future once the class is split. - -#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \ - template class ObjectType, \ - template class ArrayType, \ - class StringType, class BooleanType, class NumberIntegerType, \ - class NumberUnsignedType, class NumberFloatType, \ - template class AllocatorType, \ - template class JSONSerializer> - -#define NLOHMANN_BASIC_JSON_TPL \ - basic_json - -// #include - - -#include // not -#include // size_t -#include // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type - -namespace nlohmann -{ -namespace detail -{ -// alias templates to reduce boilerplate -template -using enable_if_t = typename std::enable_if::type; - -template -using uncvref_t = typename std::remove_cv::type>::type; - -// implementation of C++14 index_sequence and affiliates -// source: https://stackoverflow.com/a/32223343 -template -struct index_sequence -{ - using type = index_sequence; - using value_type = std::size_t; - static constexpr std::size_t size() noexcept - { - return sizeof...(Ints); - } -}; - -template -struct merge_and_renumber; - -template -struct merge_and_renumber, index_sequence> - : index_sequence < I1..., (sizeof...(I1) + I2)... > {}; - -template -struct make_index_sequence - : merge_and_renumber < typename make_index_sequence < N / 2 >::type, - typename make_index_sequence < N - N / 2 >::type > {}; - -template<> struct make_index_sequence<0> : index_sequence<> {}; -template<> struct make_index_sequence<1> : index_sequence<0> {}; - -template -using index_sequence_for = make_index_sequence; - -// dispatch utility (taken from ranges-v3) -template struct priority_tag : priority_tag < N - 1 > {}; -template<> struct priority_tag<0> {}; - -// taken from ranges-v3 -template -struct static_const -{ - static constexpr T value{}; -}; - -template -constexpr T static_const::value; -} -} - -// #include - - -#include // not -#include // numeric_limits -#include // false_type, is_constructible, is_integral, is_same, true_type -#include // declval - -// #include - -// #include - -// #include +// #include #include - -// #include - - -namespace nlohmann -{ -namespace detail -{ -template struct make_void -{ - using type = void; -}; -template using void_t = typename make_void::type; -} -} - - -// http://en.cppreference.com/w/cpp/experimental/is_detected -namespace nlohmann -{ -namespace detail -{ -struct nonesuch -{ - nonesuch() = delete; - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; - void operator=(nonesuch const&) = delete; -}; - -template class Op, - class... Args> -struct detector -{ - using value_t = std::false_type; - using type = Default; -}; - -template class Op, class... Args> -struct detector>, Op, Args...> -{ - using value_t = std::true_type; - using type = Op; -}; - -template